diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..4fb53adaa --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,7 @@ +# Summary (Write a short headline summary of PR) + +- This PR is a (refactoring, bugfix, feature, something else) +- It does the following (modify list as needed): + - Modifies/refactors (class or method) (how?) + - Fixes (issue number(s)) + - Adds (specific feature) at the request of (project or person) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 000000000..942920275 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,107 @@ +############################################################################### +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +############################################################################### +# General GitLab pipelines configurations for supercomputers and Linux clusters +# at Lawrence Livermore National Laboratory (LLNL). +# +# This entire pipeline is LLNL-specific +# ############################################################################# + +# We define the following GitLab pipeline variables: +# +# GIT_SUBMODULE_STRATEGY: +# Tells Gitlab to recursively update the submodules when cloning umpire +# +# ALLOC_NAME: +# On LLNL's ruby, this pipeline creates only one allocation shared among jobs +# in order to save time and resources. This allocation has to be uniquely named +# so that we are sure to retrieve it. +# +# BUILD_ROOT: +# The path to the shared resources between all jobs. The BUILD_ROOT is unique to +# the pipeline, preventing any form of concurrency with other pipelines. This +# also means that the BUILD_ROOT directory will never be cleaned. 
+# +# DEFAULT_TIME: +# Default time to let the Lassen jobs run will be 30 minutes. However, if it is +# a job that requires more time, it will be overwritten in the lassen template +# file. +# TODO: add a clean-up mechanism + +variables: + GIT_SUBMODULE_STRATEGY: recursive + ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} + BUILD_ROOT: ${CI_PROJECT_DIR} + DEFAULT_TIME: 30 + MP_BRANCH: "develop" + +# Normally, stages are blocking in Gitlab. However, using the keyword "needs" we +# can express dependencies between job that break the ordering of stages, in +# favor of a DAG. +# In practice r_*, l_* and b_* stages are independently run and start immediately. + +stages: + - r_allocate_resources + - r_build_and_test + - r_release_resources + - l_build_and_test + - c_allocate_resources + - c_build_and_test + - c_release_resources + +# This is the rules that drives the activation of "advanced" jobs. All advanced +# jobs will share this through a template mechanism. +.advanced_pipeline: + rules: + - if: '$CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "develop" || $ALL_TARGETS == "ON"' #run only if ... + +# These are also templates (.name) that define project specific build commands. +# If an allocation exist with the name defined in this pipeline, the job will +# use it (slurm specific). +.build_toss_3_x86_64_ib_script: + script: + - echo ${ALLOC_NAME} + - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) + - echo ${JOBID} + - srun $( [[ -n "${JOBID}" ]] && echo "--jobid=${JOBID}" ) -t ${DEFAULT_TIME} -N 1 scripts/gitlab/build_and_test.sh + artifacts: + reports: + junit: junit.xml + +.build_toss_4_x86_64_ib_corona_script: + script: + - srun -p pbatch -t 30 -N 1 scripts/gitlab/build_and_test.sh + +# Lassen and Butte use a different job scheduler (spectrum lsf) that does not +# allow pre-allocation the same way slurm does. 
+.build_blueos_3_ppc64le_ib_script: + script: + - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh + artifacts: + reports: + junit: junit.xml + +.build_blueos_3_ppc64le_ib_ats_disabled_script: + script: + - lalloc 1 --atsdisable -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh + artifacts: + reports: + junit: junit.xml + +.build_blueos_3_ppc64le_ib_p9_script: + extends: .build_blueos_3_ppc64le_ib_script + +# This is where jobs are included. +include: + - local: .gitlab/ruby-templates.yml + - local: .gitlab/ruby-jobs.yml + - local: .gitlab/lassen-templates.yml + - local: .gitlab/lassen-jobs.yml + - local: .gitlab/corona-templates.yml + - local: .gitlab/corona-jobs.yml diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml new file mode 100644 index 000000000..4b9428f3a --- /dev/null +++ b/.gitlab/corona-jobs.yml @@ -0,0 +1,16 @@ +############################################################################# +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################# + +hip_5.1.0_clang_13_0_0 (build and test on corona): + variables: + SPEC: "+rocm~openmp amdgpu_target=gfx906 %clang@13.0.0 ^blt@develop ^hip@5.1.0" + extends: .build_and_test_on_corona + +#hip_5.1.0_clang_13_0_0_desul_atomics (build and test on corona): +# variables: +# SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 ^blt@develop ^hip@5.1.0" +# extends: .build_and_test_on_corona diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml new file mode 100644 index 000000000..4e1a5cb74 --- /dev/null +++ b/.gitlab/corona-templates.yml @@ -0,0 +1,33 @@ +############################################################################# +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. 
See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################# + +#### +# This is the shared configuration of jobs for corona + +#### +# In pre-build phase, allocate a node for builds +.on_corona: + tags: + - shell + - corona + rules: + - if: '$ON_CORONA == "OFF"' #run except if ... + when: never + - if: '$CI_JOB_NAME =~ /release_resources/' + when: always + - when: on_success + +#### +# Generic corona build job, extending build script +.build_and_test_on_corona: + stage: c_build_and_test + extends: [.build_toss_4_x86_64_ib_corona_script, .on_corona] + needs: [] + +.build_and_test_on_corona_advanced: + extends: [.build_and_test_on_corona, .advanced_pipeline] + diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml new file mode 100644 index 000000000..034de13eb --- /dev/null +++ b/.gitlab/lassen-jobs.yml @@ -0,0 +1,98 @@ +############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +########## +# CPU ONLY +########## + +#ibm_clang_10_0_1: +# variables: +# SPEC: "%clang@ibm.10.0.1" +# extends: .build_and_test_on_lassen + +clang_11_0_0: + variables: + SPEC: "+openmp %clang@11.0.0" + extends: .build_and_test_on_lassen + +#ibm_clang_9_gcc_8: +# variables: +# SPEC: "%clang@ibm.9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" +# extends: .build_and_test_on_lassen + +gcc_8_3_1: + variables: + SPEC: "+openmp %gcc@8.3.1 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000'" + extends: .build_and_test_on_lassen + +xl_16_1_1_12: + variables: + SPEC: "+openmp %xl@16.1.1.12 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036'" + DEFAULT_TIME: 50 + extends: .build_and_test_on_lassen + +xl_16_1_1_12_gcc_8_3_1: + variables: + SPEC: "+openmp %xl@16.1.1.12 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + DEFAULT_TIME: 50 + extends: .build_and_test_on_lassen + +########## +# CUDA +########## + +#ibm_clang_9_cuda: +# variables: +# SPEC: "+cuda cuda_arch=70 %clang@ibm.9.0.0 ^cuda@10.1.168" +# extends: .build_and_test_on_lassen + +clang_11_cuda: + variables: + SPEC: "+openmp +cuda cuda_arch=70 %clang@11.0.0 ^cuda@10.1.168" + extends: .build_and_test_on_lassen + +gcc_8_3_1_cuda: + variables: + SPEC: "+openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + extends: .build_and_test_on_lassen + +gcc_8_3_1_cuda_ats_disabled: + variables: + SPEC: "+openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + extends: .build_and_test_on_lassen_ats_disabled 
+ +xl_16_1_1_12_cuda: + variables: + SPEC: "+openmp +cuda %xl@16.1.1.12 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036' cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + DEFAULT_TIME: 60 + extends: .build_and_test_on_lassen + +xl_16_1_1_12_gcc_8_3_1_cuda_11: + variables: + SPEC: "+openmp +cuda %xl@16.1.1.12 cuda_arch=70 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" + DEFAULT_TIME: 60 + extends: .build_and_test_on_lassen + +########## +# EXTRAS +########## + +clang_9_0_0_libcpp (build and test on lassen): + variables: + SPEC: "+openmp %clang@9.0.0+libcpp" + extends: .build_and_test_on_lassen + +clang_9_0_0_memleak (build and test on lassen): + variables: + SPEC: "+openmp %clang@9.0.0 cxxflags=-fsanitize=address" + ASAN_OPTIONS: "detect_leaks=1" + extends: .build_and_test_on_lassen + +#gcc_8_3_1_cuda_desul_atomics: +# variables: +# SPEC: "+cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" +# extends: .build_and_test_on_lassen diff --git a/.gitlab/lassen-templates.yml b/.gitlab/lassen-templates.yml new file mode 100644 index 000000000..dbc340f22 --- /dev/null +++ b/.gitlab/lassen-templates.yml @@ -0,0 +1,34 @@ +############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +#### +# Shared configuration of jobs for lassen +.on_lassen: + variables: + tags: + - shell + - lassen + rules: + - if: '$CI_COMMIT_BRANCH =~ /_lnone/ || $ON_LASSEN == "OFF"' #run except if ... 
+ when: never + - when: on_success + +.build_and_test_on_lassen: + stage: l_build_and_test + extends: [.build_blueos_3_ppc64le_ib_p9_script, .on_lassen] + needs: [] + +.build_and_test_on_lassen_ats_disabled: + stage: l_build_and_test + extends: [.build_blueos_3_ppc64le_ib_ats_disabled_script, .on_lassen] + needs: [] + +# Note: .build_and_test_on_lassen_advanced inherits from +# .build_and_test_on_lassen and .advanced_pipeline. +# In particular, the rules section will be merged. Careful when changing rules. +.build_and_test_on_lassen_advanced: + extends: [.build_and_test_on_lassen, .advanced_pipeline] diff --git a/.gitlab/ruby-jobs.yml b/.gitlab/ruby-jobs.yml new file mode 100644 index 000000000..cb4214c6a --- /dev/null +++ b/.gitlab/ruby-jobs.yml @@ -0,0 +1,53 @@ +############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +clang_10: + variables: + SPEC: "+openmp %clang@10.0.1" + extends: .build_and_test_on_ruby + +clang_9: + variables: + SPEC: "+openmp %clang@9.0.0" + extends: .build_and_test_on_ruby + +gcc_8_1_0: + variables: + SPEC: "+openmp %gcc@8.1.0" + DEFAULT_TIME: 60 + extends: .build_and_test_on_ruby + +#icpc_17_0_2: +# variables: +# SPEC: "%intel@17.0.2" +# DEFAULT_TIME: 40 +# extends: .build_and_test_on_ruby + +#icpc_18_0_2: +# variables: +# SPEC: " tests=none %intel@18.0.2" +# DEFAULT_TIME: 40 +# extends: .build_and_test_on_ruby + +icpc_19_1_0: + variables: + SPEC: "+openmp %intel@19.1.0" + DEFAULT_TIME: 40 + extends: .build_and_test_on_ruby + +# EXTRAS + +#gcc_4_9_3: +# variables: +# SPEC: "%gcc@4.9.3" +# DEFAULT_TIME: 60 +# extends: .build_and_test_on_ruby + +#clang_10_desul_atomics: +# variables: +# SPEC: "+openmp +desul %clang@10.0.1 
cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" +# extends: .build_and_test_on_ruby diff --git a/.gitlab/ruby-templates.yml b/.gitlab/ruby-templates.yml new file mode 100644 index 000000000..b1314534b --- /dev/null +++ b/.gitlab/ruby-templates.yml @@ -0,0 +1,54 @@ +############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +#### +# This is the shared configuration of jobs for ruby + +#### +# In pre-build phase, allocate a node for builds +.on_ruby: + tags: + - shell + - ruby + rules: + - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_RUBY == "OFF"' #run except if ... + when: never + - if: '$CI_JOB_NAME =~ /release_resources/' + when: always + - when: on_success + +#### +# In pre-build phase, allocate a node for builds +# NOTE: Not specifying 'salloc -c 56' should allocate the max number of CPU cores +allocate_resources (on ruby): + variables: + GIT_STRATEGY: none + extends: .on_ruby + stage: r_allocate_resources + script: + - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} + +#### +# In post-build phase, deallocate resources +# Note : make sure this is run even on build phase failure +release_resources (on ruby): + variables: + GIT_STRATEGY: none + extends: .on_ruby + stage: r_release_resources + script: + - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) + - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) + +#### +# Generic ruby build job, extending build script +.build_and_test_on_ruby: + extends: [.build_toss_3_x86_64_ib_script, .on_ruby] + stage: r_build_and_test + +.build_and_test_on_ruby_advanced: + extends: [.build_and_test_on_ruby, .advanced_pipeline] diff --git a/.uberenv_config.json 
b/.uberenv_config.json new file mode 100644 index 000000000..6b9b7ca68 --- /dev/null +++ b/.uberenv_config.json @@ -0,0 +1,12 @@ +{ +"package_name" : "raja_perf", +"package_version" : "develop", +"package_final_phase" : "hostconfig", +"package_source_dir" : "../..", +"spack_url": "https://github.com/davidbeckingsale/spack", +"spack_branch": "feature/allow-untested-cuda-versions", +"spack_commit": "46b22d0f6227f6b12bab712bda5b916a53cfc67d", +"spack_activate" : {}, +"spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", +"spack_packages_path": "scripts/spack_packages" +} diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fc8c256a..813d1e9b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,51 +1,74 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC -# and RAJA Performance Suite project contributors. +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -project(RAJAPerfSuite CXX) +# C is required for googletest to find Threads +project(RAJAPerfSuite LANGUAGES CXX C) -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.14.5) option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable this, and all other variants, to run _only_ raw C loops." 
On) # -# Initialize the BLT build system +# Note: the BLT build system is inherited by RAJA and is initialized by RAJA # if (PERFSUITE_ENABLE_WARNINGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") endif() -set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests") -set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") -set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") - -set(CMAKE_CXX_STANDARD 11) -set(BLT_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) +set(BLT_CXX_STD c++14) include(blt/SetupBLT.cmake) +# +# Define RAJA PERFSUITE settings... +# + +cmake_dependent_option(RAJA_PERFSUITE_ENABLE_TESTS "Enable RAJA Perf Suite Tests" On "ENABLE_TESTS" Off) + +if (ENABLE_TESTS) + + set(RAJA_ENABLE_TESTS Off CACHE BOOL "") + set(CAMP_ENABLE_TESTS Off CACHE BOOL "") + +endif() + +cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off) + +cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off) + # # Define RAJA settings... # -set(ENABLE_TESTS Off CACHE BOOL "") -set(ENABLE_EXAMPLES Off CACHE BOOL "") +set(RAJA_ENABLE_TESTS Off CACHE BOOL "") +set(RAJA_ENABLE_EXAMPLES Off CACHE BOOL "") +set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "") set(ENABLE_DOCUMENTATION Off CACHE BOOL "") set(ENABLE_TBB Off CACHE BOOL "") set(RAJA_USE_CHRONO On CACHE BOOL "") +set(RAJA_PERFSUITE_GPU_BLOCKSIZES "" CACHE STRING "Comma separated list of GPU block sizes, ex '256,1024'") + set(RAJA_RANGE_ALIGN 4) set(RAJA_RANGE_MIN_LENGTH 32) set(RAJA_DATA_ALIGN 64) +string(LENGTH "${RAJA_PERFSUITE_GPU_BLOCKSIZES}" BLOCKSIZES_LENGTH) +if (BLOCKSIZES_LENGTH GREATER 0) + message(STATUS "Using gpu block size(s): ${RAJA_PERFSUITE_GPU_BLOCKSIZES}") +else() + message(STATUS "Using default gpu block size(s)") +endif() # exclude RAJA make targets from top-level build... 
add_subdirectory(tpl/RAJA) @@ -53,15 +76,8 @@ add_subdirectory(tpl/RAJA) get_property(RAJA_INCLUDE_DIRS DIRECTORY tpl/RAJA PROPERTY INCLUDE_DIRECTORIES) include_directories(${RAJA_INCLUDE_DIRS}) +set(CAMP_ENABLE_TESTS Off CACHE BOOL "") -# -# Setup variables to pass to Perf suite -# - -# -# These (hopefully temporary) macro constants are needed to work-around -# performance issues in the xl compiler. -# if (ENABLE_RAJA_SEQUENTIAL) add_definitions(-DRUN_RAJA_SEQ) endif () @@ -75,21 +91,29 @@ set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0) set(RAJA_PERFSUITE_DEPENDS RAJA) +if (RAJA_PERFSUITE_ENABLE_MPI) + list(APPEND RAJA_PERFSUITE_DEPENDS mpi) +endif() if (ENABLE_OPENMP) list(APPEND RAJA_PERFSUITE_DEPENDS openmp) endif() if (ENABLE_CUDA) list(APPEND RAJA_PERFSUITE_DEPENDS cuda) -endif() +endif() if (ENABLE_HIP) - list(APPEND RAJA_PERFSUITE_DEPENDS hip) + message(STATUS "HIP version: ${hip_VERSION}") + if("${hip_VERSION}" VERSION_LESS "3.5") + message(FATAL_ERROR "Trying to use HIP/ROCm version ${hip_VERSION}. RAJA Perf Suite requires HIP/ROCm version 3.5 or newer. 
") + endif() + list(APPEND RAJA_PERFSUITE_DEPENDS blt::hip) + list(APPEND RAJA_PERFSUITE_DEPENDS blt::hip_runtime) endif() set(RAJAPERF_BUILD_SYSTYPE $ENV{SYS_TYPE}) set(RAJAPERF_BUILD_HOST $ENV{HOSTNAME}) if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda --expt-relaxed-constexpr") set(RAJAPERF_COMPILER "${CUDA_NVCC_EXECUTABLE}") @@ -107,9 +131,11 @@ else() endif() configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in - ${CMAKE_CURRENT_BINARY_DIR}/bin/rajaperf_config.hpp) + ${CMAKE_CURRENT_BINARY_DIR}/include/rajaperf_config.hpp) -# Make sure RAJA flag propagate (we need to do some house cleaning to +include_directories($) + +# Make sure RAJA flag propagate (we need to do some house cleaning to # remove project-specific CMake variables that are no longer needed) set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) @@ -117,3 +143,7 @@ set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) # Each directory in the perf suite has its own CMakeLists.txt file. # add_subdirectory(src) + +if (RAJA_PERFSUITE_ENABLE_TESTS) + add_subdirectory(test) +endif() diff --git a/Dockerfile b/Dockerfile index 220b3cb4b..037f59b60 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,90 +1,106 @@ -############################################################################### -# Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -FROM axom/compilers:gcc-5 AS gcc5 +FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-7.3.0 AS gcc7 +ENV GTEST_COLOR=1 +COPY . 
/home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure + +FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN ls -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_DEPRECATED_TESTS=On .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:gcc-5 AS gcc5-debug +FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_WARNINGS=On -DENABLE_COVERAGE=On -DENABLE_OPENMP=On .. -RUN cd build && make -j 16 +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:gcc-6 AS gcc6 +FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_ENABLE_RUNTIME_PLUGINS=On .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. 
&& \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:gcc-7 AS gcc7 +FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ + cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:gcc-8 AS gcc8 +FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_ENABLE_BOUNDS_CHECK=ON .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:clang-9 AS clang9 +FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-fmodules -DENABLE_OPENMP=On .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ + cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On .. 
&& \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:clang-9 AS clang9-debug +FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DCMAKE_CXX_FLAGS=-fsanitize=address .. -RUN cd build && make -j 16 +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ + cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ + make -j 4 -FROM axom/compilers:nvcc-10.2 AS nvcc10 +FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 .. -RUN cd build && make -j 2 +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ + cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ + make -j 4 -FROM axom/compilers:nvcc-10.2 AS nvcc10-debug +FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11-debug ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 .. -RUN cd build && make -j 2 +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ + cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. 
&& \ + make -j 4 -FROM axom/compilers:rocm AS hip +FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-4.3.1 AS hip ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace ENV HCC_AMDGPU_TARGET=gfx900 -RUN mkdir build && cd build && cmake -DROCM_ROOT_DIR=/opt/rocm/include -DHIP_RUNTIME_INCLUDE_DIRS="/opt/rocm/include;/opt/rocm/hip/include" -DENABLE_HIP=On -DENABLE_OPENMP=Off -DENABLE_CUDA=Off -DENABLE_WARNINGS_AS_ERRORS=Off -DHIP_HIPCC_FLAGS=-fPIC .. -RUN cd build && make -j 16 +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ + cmake -DCMAKE_CXX_COMPILER=amdclang++ -DRAJA_ENABLE_EXTERNAL_ROCPRIM=Off -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DENABLE_OPENMP=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_EXPORT_THIRDPARTY=On .. && \ + make -j 6 -FROM axom/compilers:oneapi AS sycl +FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.0.1 AS sycl ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN /bin/bash -c "source /opt/intel/inteloneapi/setvars.sh && mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=dpcpp -DENABLE_SYCL=On .." -RUN /bin/bash -c "source /opt/intel/inteloneapi/setvars.sh && cd build && make -j 16" +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN /bin/bash -c "source /opt/view/setvars.sh && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ + make -j 6 &&\ + ./bin/raja-perf.exe --checkrun 5 -sp" diff --git a/LICENSE b/LICENSE index f08c6273a..8e4df6528 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2017-2021, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC. All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 5b1de76f9..a85ad0f5f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [comment]: # (#################################################################) -[comment]: # (Copyright 2017-2021, Lawrence Livermore National Security, LLC) +[comment]: # (Copyright 2017-2022, Lawrence Livermore National Security, LLC) [comment]: # (and RAJA Performance Suite project contributors.) -[comment]: # (See the RAJA/LICENSE file for details.) +[comment]: # (See the RAJAPerf/LICENSE file for details.) [comment]: # [comment]: # (# SPDX-License-Identifier: BSD-3-Clause) [comment]: # (#################################################################) @@ -12,28 +12,40 @@ RAJA Performance Suite [![Build Status](https://travis-ci.org/LLNL/RAJAPerf.svg?branch=develop)](https://travis-ci.org/LLNL/RAJAPerf) -The RAJA Performance Suite is designed to explore performance of loop-based +The RAJA Performance Suite is designed to explore performance of loop-based computational kernels found in HPC applications. Specifically, it can be -used to assess and monitor runtime performance of kernels implemented using -[RAJA] C++ performance portability abstractions and compare those to variants -implemented using common parallel programming models, such as OpenMP and CUDA, +used to assess and monitor runtime performance of kernels implemented using +[RAJA] C++ performance portability abstractions and compare those to variants +implemented using common parallel programming models, such as OpenMP and CUDA, directly. Some important terminology used in the Suite includes: * `Kernel` is a distinct loop-based computation that appears in the Suite in - multiple variants (or implementations), each of which performs the same + multiple variants (or implementations), each of which performs the same computation. 
- * `Variant` is a particular implementation of a kernel in the Suite, + * `Variant` is an implementation or set of implementations of a kernel in the + Suite that share the same approach/abstraction and programming model, such as baseline OpenMP, RAJA OpenMP, etc. + * `Tuning` is a particular implementation of a variant of a kernel in the + Suite, such as gpu block size 128, gpu block size 256, etc. * `Group` is a collection of kernels in the Suite that are grouped together - because they originate from the same source, such as a specific benchmark + because they originate from the same source, such as a specific benchmark suite. Each kernel in the Suite appears in multiple RAJA and non-RAJA (i.e., baseline) -variants using parallel programming models that RAJA supports. The kernels -originate from various HPC benchmark suites and applications. For example, -the "Stream" group contains kernels from the Babel Stream benchmark, the "Apps" -group contains kernels extracted from real scientific computing applications, -and so forth. +variants using parallel programming models that RAJA supports. Some kernels have +multiple tunings of a variant to explore some of the parametrization that the +programming model supports. The kernels originate from various HPC benchmark +suites and applications. For example, the "Stream" group contains kernels from +the Babel Stream benchmark, the "Apps" group contains kernels extracted from +real scientific computing applications, and so forth. + +The suite can be run as a single process or with multiple processes when +configured with MPI support. Running with MPI in the same configuration used +by an hpc app allows the suite to gather performance data that is more relevant +for that hpc app than performance data gathered running single process. For +example running sequentially with one MPI rank per core vs running sequentially +with a single process yields different performance results on most multi-core +CPUs. 
* * * @@ -44,6 +56,7 @@ Table of Contents 2. [Running the Suite](#running-the-suite) 3. [Generated output](#generated-output) 4. [Adding kernels and variants](#adding-kernels-and-variants) +4. [Continuous Integration](#continuous-integration) 5. [Contributions](#contributions) 6. [Authors](#authors) 7. [Copyright and Release](#copyright-and-release) @@ -52,7 +65,7 @@ Table of Contents # Building the Suite -To build the Suite, you must first obtain a copy of the source code by cloning +To build the Suite, you must first obtain a copy of the source code by cloning the GitHub repository. For example, ``` @@ -61,13 +74,13 @@ the GitHub repository. For example, > git clone --recursive https://github.com/llnl/RAJAPerf.git ``` -The repository will reside in a `RAJAPerf` sub-directory in the directory into +The repository will reside in a `RAJAPerf` sub-directory in the directory into which it was cloned. -The Performance Suite has two Git submodules, [RAJA] and the CMake-based [BLT] +The Performance Suite has two Git submodules, [RAJA] and the CMake-based [BLT] build system. The `--recursive` option tells Git to clone the submodules as well as any submodules that they use. If you switch to a different branch -in your working copy of the repository, you should update the submodules to +in your working copy of the repository, you should update the submodules to make sure you have the right versions of them for the branch. For example, ``` @@ -76,18 +89,18 @@ make sure you have the right versions of them for the branch. For example, > git submodule update --recursive ``` -Note that the `--recursive` option will update submodules within submodules, +Note that the `--recursive` option will update submodules within submodules, similar to usage with the `git clone` as described above. RAJA and the Performance Suite are built together using the same CMake configuration. 
For convenience, we include scripts in the `scripts` -directory that invoke corresponding configuration files (CMake cache files) -in the RAJA submodule. For example, the `scripts/lc-builds` directory +directory that invoke corresponding configuration files (CMake cache files) +in the RAJA submodule. For example, the `scripts/lc-builds` directory contains scripts that show how we build code for testing on platforms in -the Lawrence Livermore Computing Center. Each build script creates a -descriptively-named build space directory in the top-level Performance Suite -directory and runs CMake with a configuration appropriate for the platform and -compilers used. After CMake completes, enter the build directory and type +the Lawrence Livermore Computing Center. Each build script creates a +descriptively-named build space directory in the top-level Performance Suite +directory and runs CMake with a configuration appropriate for the platform and +compilers used. After CMake completes, enter the build directory and type `make` (or `make -j ` for a parallel build using N processor cores; if you omit the number of cores, the code will build in parallel using all available cores on the node you are running on) to compile the code. For example, @@ -98,7 +111,7 @@ cores on the node you are running on) to compile the code. For example, > make -j ``` -The build scripts and associated CMake `host-config` files in RAJA are +The build scripts and associated CMake `host-config` files in RAJA are useful sources of information for building the Suite on various platforms. For example, they show how to enable specific back-end kernel variants and compiler options we use for testing. @@ -115,42 +128,88 @@ options from there. For example, : The provided configurations will only build the Performance Suite code by default; i.e., it will not build any RAJA test or example codes. 
If you -want to build the RAJA tests, for example, to verify your build of RAJA is +want to build the RAJA tests, for example, to verify your build of RAJA is working properly, just pass the `-DENABLE_TESTS=On` option to CMake, either -on the command line if you run CMake directly or edit the script you are +on the command line if you run CMake directly or edit the script you are running to do this. Then, when the build completes, you can type `make test` to run the RAJA tests. +## Building with MPI + +Some of the provided configurations will build the Performance Suite with +MPI support enabled. For example, + +``` +> ./scripts/blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 sm_70 10.0.1 +> cd build_lc_blueos-spectrumrolling-release-nvcc10.2.89-sm_70-clang10.0.1 +> make -j +``` + +In general MPI support can be enabled by passing the `-DENABLE_MPI=On` option +to CMake and providing an MPI compiler wrapper via the +`-DMPI_CXX_COMPILER=/path/to/mpic++` option to CMake in addition to other CMake +options. For example, + +``` +> mkdir my-mpi-build +> cd my-mpi-build +> cmake -DENABLE_MPI=On -DMPI_CXX_COMPILER=/path/to/mpic++ ../ +> make -j +``` + +## Building with specific GPU block size tunings + +Some of the provided configurations will build the Performance Suite with +GPU support enabled. This will build with the default GPU block size tuning for +each kernel. For example, + +``` +> ./scripts/blueos_nvcc_clang.sh 10.2.89 sm_70 10.0.1 +> cd build_lc_blueos-nvcc10.2.89-sm_70-clang10.0.1 +> make -j +``` + +Using a specific set of GPU block sizes is done by passing the +`-DRAJA_PERFSUITE_GPU_BLOCKSIZES=` option to CMake, given a +comma-separated list of block sizes, in addition to other CMake +options. 
For example, + +``` +> mkdir my-gpu-build +> cd my-gpu-build +> cmake -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 ../ +> make -j +``` * * * # Running the Suite -The Suite is run by invoking the executable in the `bin` sub-directory in the +The Suite is run by invoking the executable in the `bin` sub-directory in the build space directory. For example, if you provide no command line options, ``` > ./bin/raja-perf.exe ``` -the entire Suite (all kernels and variants) will execute in their default -configurations. How the Suite will run and some details about each kernel +the entire Suite (all kernels and variants) will execute in their default +configurations. How the Suite will run and some details about each kernel will appear on the screen before it is run. Kernel detail information will also appear in a run report file generated in your run directory -after the Suite executes. You can pass the ''--dryrun'' option along with +after the Suite executes. You can pass the ''--dryrun'' option along with any other runtime options to see a summary of how the Suite will execute without actually running it. The Suite can be run in a variety of ways via options passed to the executable. -For example, you can run subsets of kernels and variants by specifying -variants, groups, or individual kernels explicitly. Other configuration -options to set problem sizes, number of times each kernel is run, etc. can -also be specified. You build the code once and use scripts or other mechanisms +For example, you can run subsets of kernels and variants by specifying +variants, groups, or individual kernels explicitly. Other configuration +options to set problem sizes, number of times each kernel is run, etc. can +also be specified. You build the code once and use scripts or other mechanisms to run the Suite in different ways for analyses you want to perform. All options appear in a 'long form' with a double hyphen prefix (i.e., '--'). 
-Some options are available in a one or two character 'short form' with a -single hyphen prefix (i.e., '-') for convenience. To see available options +Some options are available in a one or two character 'short form' with a +single hyphen prefix (i.e., '-') for convenience. To see available options along with a brief description of each, pass the `--help` or `-h` option: ``` @@ -163,31 +222,47 @@ or > ./bin/raja-perf.exe -h ``` -Lastly, the program will generate a summary of provided input if it is given +Lastly, the program will generate a summary of provided input if it is given input that the code does not know how to parse. Ill-formed input will be noted -in the summary output. Hopefully, this will make it easy for users to correct +in the summary output. Hopefully, this will make it easy for users to correct erroneous usage, such as mis-spelled option names. +## Running with MPI + +Running the Suite with MPI is as simple as running any other MPI application. +For example, + +``` +> srun -n 2 ./bin/raja-perf.exe +``` +the entire Suite (all kernels and variants) will execute in their default +configurations on each of the 2 ranks. The kernel information output shows how +each kernel is run on each rank. The total problem size across all MPI ranks +can be calculated by multiplying the number of MPI ranks by the problem +size in the kernel information. Timing is reported on rank 0 and is gathered +by doing an MPI barrier, starting the timer, running the kernel repetitions, +doing an MPI barrier, and then stopping the timer. + ## Important note - * The OpenMP target offload variants of the kernels in the Suite are a + * The OpenMP target offload variants of the kernels in the Suite are a work-in-progress since the RAJA OpenMP target offload back-end is also - a work-in-progress. If you configure them to build, they can be run with - the executable `./bin/raja-perf-omptarget.exe` which is distinct from + a work-in-progress. 
If you configure them to build, they can be run with + the executable `./bin/raja-perf-omptarget.exe` which is distinct from the one described above. At the time the OpenMP target offload variants were developed, it was not possible for them to co-exist in the same executable as the CUDA variants, for example. In the future, the build system may - be reworked so that the OpenMP target variants can be run from the same + be reworked so that the OpenMP target variants can be run from the same executable as the other variants. * * * # Generated output -When the Suite is run, several output files are generated that contain -data describing the run. The file names start with the file prefix +When the Suite is run, several output files are generated that contain +data describing the run. The file names start with the file prefix provided via a command line option in the output directory, also specified -on the command line. If no such options are provided, files will be located +on the command line. If no such options are provided, files will be located in the current run directory and be named `RAJAPerf-*`, where '*' is a string indicating the contents of the file. @@ -200,14 +275,14 @@ Currently, there are five files generated: 5. Kernel -- Basic information about each kernel that is run, which is the same for each variant of the kernel that is run. See description of output information below. -All output files are text files. Other than the checksum file, all are in +All output files are text files. Other than the checksum file, all are in 'csv' format for easy processing by common tools and generating plots. ## Kernel information definitions -Information about kernels that are run is located in the ''RAJAPerf-kernels.csv'' file, which includes the following: +Information about kernels that are run is located in the ''RAJAPerf-kernels.csv'' file. 
This information is for each process individually, so when running with MPI the total problem size aggregated across all ranks is the number of ranks times the problem size shown in the kernel information. Kernel information includes the following: -1. Kernel name -- Format is group name followed by kernel name, separated by an underscore. +1. Kernel name -- Format is group name followed by kernel name, separated by an underscore. 2. Feature -- RAJA feature(s) exercised in RAJA variants of kernel. 3. Problem size -- Size of the problem represented by a kernel. Please see notes below for more information. 4. Reps -- Number of times a kernel runs in a single pass through the Suite. @@ -218,38 +293,41 @@ Information about kernels that are run is located in the ''RAJAPerf-kernels.csv' ### Notes about 'problem size' - * The Suite uses three notions of problem size for each kernel: 'default', - 'target', and 'actual'. Default is the 'default' problem size defined for a - kernel and the size that will be run if no runtime options are + * Problem size is always output per process/MPI rank. To get the total problem + size across all ranks when running with MPI multiply the problem size by + the number of MPI ranks. + * The Suite uses three notions of problem size for each kernel: 'default', + 'target', and 'actual'. Default is the 'default' problem size defined for a + kernel and the size that will be run if no runtime options are provided to run a different size. Target is the desired problem size to run based on default settings and alterations to that if input is provided to - change the default. Actual is the problem size that is run based on how + change the default. Actual is the problem size that is run based on how each kernel calculates this. * The concept of problem size is subjective and can be interpreted differently - depending on the kernel structure and what one is trying to measure. 
For - example, problem size could refer to the amount of data needed to be stored - in memory to run the problem, or it could refer to the amount of parallel - work that is possible, etc. - * We employ the following, admittedly loose definition, which depends on the - particular kernel structure. Of all the 'loop structures' (e.g., single - loop, nested loops, etc.) that are run for a kernel (note that some kernels - run multiple loops, possibly with different sizes or loop structures), - problem size refers to the size of the data set required to generate the - kernel result. The interpretation of this and the definition of problem - size for each kernel in the suite is determined by the kernel developer + depending on the kernel structure and what one is trying to measure. For + example, problem size could refer to the amount of data needed to be stored + in memory to run the problem, or it could refer to the amount of parallel + work that is possible, etc. + * We employ the following, admittedly loose definition, which depends on the + particular kernel structure. Of all the 'loop structures' (e.g., single + loop, nested loops, etc.) that are run for a kernel (note that some kernels + run multiple loops, possibly with different sizes or loop structures), + problem size refers to the size of the data set required to generate the + kernel result. The interpretation of this and the definition of problem + size for each kernel in the suite is determined by the kernel developer and team discussion. -Here are a few examples to give a better sense of how we determine problem +Here are a few examples to give a better sense of how we determine problem size for various kernels in the Suite. Vector addition. ```cpp for (int i = 0; i < 0; i < N; ++i) { - c[i] = a[i] + b[i]; + c[i] = a[i] + b[i]; } ``` -The problem size for this kernel is 'N', the loop length. 
Note that this -happens to match the size of the vectors a, b, c and the total amount of +The problem size for this kernel is 'N', the loop length. Note that this +happens to match the size of the vectors a, b, c and the total amount of parallel work in the kernel. This is common for simple, data parallel kernels. Matrix-vector multiplication. @@ -267,7 +345,7 @@ work is N_r, the number of rows in the matrix and the length of the vector b. Matrix-matrix multiplication. ```cpp -for (int i = 0; i < N_i; ++i) { +for (int i = 0; i < N_i; ++i) { for (int j = 0; j < N_j; ++j) { A[i][j] = 0; for (int k = 0; k < N_k; ++k) { @@ -278,19 +356,19 @@ for (int i = 0; i < N_i; ++i) { ``` Here, we are multiplying matrix B (N_i x N_k) and matrix C (N_k x N_j) and storing the result in matrix A (N_i X N_j). Problem size could be chosen to -be the maximum number of entries in matrix B or C. We choose the size of -matrix A (N_i * N_j), which is more closely aligned with the number of -independent operations (i.e., the amount of parallel work) in the kernels. +be the maximum number of entries in matrix B or C. We choose the size of +matrix A (N_i * N_j), which is more closely aligned with the number of +independent operations (i.e., the amount of parallel work) in the kernels. * * * -# Adding kernels and variants +# Adding kernels, variants, and Tunings -This section describes how to add new kernels and/or variants to the Suite. -*Group* and *feature* modifications are not required unless a new group or -exercised RAJA feature is added when a new kernel is introduced. The -information in this section also provides insight into how the performance +This section describes how to add new kernels, variants and/or tunings to the +Suite. *Group* and *feature* modifications are not required unless a new group +or exercised RAJA feature is added when a new kernel is introduced. The +information in this section also provides insight into how the performance Suite operates. 
It is essential that the appropriate targets are updated in the appropriate @@ -301,7 +379,7 @@ be compiled. Adding a new kernel to the Suite involves three main steps: -1. Add a unique kernel ID and a unique kernel name to the Suite. +1. Add a unique kernel ID and a unique kernel name to the Suite. 2. If the kernel is part of a new kernel group or exercises a new RAJA feature, also add a unique group ID and name for the group. Similarly, if a new RAJA feature is exercised by a new kernel. 3. Implement a kernel class that contains all operations needed to run it, with source files organized as described below. @@ -310,14 +388,14 @@ These steps are described in the following sections. ### Add the kernel ID and name -Two key pieces of information identify a kernel: the group in which it +Two key pieces of information identify a kernel: the group in which it resides and the name of the kernel itself. For concreteness, we describe -how to add a kernel "FOO" that lives in the kernel group "Basic". The files +how to add a kernel "FOO" that lives in the kernel group "Basic". The files `RAJAPerfSuite.hpp` and `RAJAPerfSuite.cpp` in the `src/common` directory -define enumeration values and arrays of string names for the kernels, -respectively. +define enumeration values and arrays of string names for the kernels, +respectively. -First, add an enumeration value identifier for the kernel, that is unique +First, add an enumeration value identifier for the kernel, that is unique among all kernels, in the enum 'KernelID' in the header file `RAJAPerfSuite.hpp`: ```cpp @@ -331,7 +409,7 @@ enum KernelID { Note: the enumeration value for the kernel is the group name followed by the kernel name, separated by an underscore. It is important to follow this convention so that the kernel works properly with the Performance -Suite machinery. +Suite machinery. 
Second, add the kernel name to the array of strings `KernelNames` in the file `RAJAPerfSuite.cpp`: @@ -356,7 +434,7 @@ and IDs in alphabetical order to make the organization clear. ### Add new group if needed If a kernel is added as part of a new group of kernels in the Suite, a -new value must be added to the `GroupID` enum in the header file +new value must be added to the `GroupID` enum in the header file `RAJAPerfSuite.hpp` and an associated group string name must be added to the `GroupNames` array of strings in the file `RAJAPerfSuite.cpp`. Again, the enumeration values and items in the string array must be kept @@ -367,32 +445,32 @@ Adding a new RAJA feature is similar. ### Add the kernel class -Each kernel in the Suite is implemented in a class whose header and +Each kernel in the Suite is implemented in a class whose header and implementation files live in the directory named for the group in which the kernel lives. The kernel class is responsible for implementing -all operations needed to manage data, execute, and record execution timing and -checksum information for each variant of the kernel. To properly plug in to -the Performance Suite framework, the kernel class must be a subclass of the -`KernelBase` base class that defines the interface for kernels in the Suite. +all operations needed to manage data, execute, and record execution timing and +checksum information for each variant and tuning of the kernel. To properly plug +in to the Performance Suite framework, the kernel class must be a subclass of +the `KernelBase` base class that defines the interface for kernels in the Suite. 
-Continuing with our example, we add a 'FOO' class header file `FOO.hpp`, -and multiple implementation files described in the following sections: +Continuing with our example, we add a 'FOO' class header file `FOO.hpp`, +and multiple implementation files described in the following sections: * `FOO.cpp` contains the methods to setup and teardown the memory for the - 'FOO' kernel, and compute and record a checksum on the result after it - executes. It also specifies kernel information in the kernel class + 'FOO' kernel, and compute and record a checksum on the result after it + executes. It also specifies kernel information in the kernel class constructor. - * `FOO-Seq.cpp` contains sequential CPU variants of the kernel. - * `FOO-OMP.cpp` contains OpenMP CPU multithreading variants of the kernel. - * `FOO-OMPTarget.cpp` contains OpenMP target offload variants of the kernel. - * `FOO-Cuda.cpp` contains CUDA GPU variants of the kernel. - * `FOO-Hip.cpp` contains HIP GPU variants of the kernel. + * `FOO-Seq.cpp` contains sequential CPU variants and tunings of the kernel. + * `FOO-OMP.cpp` contains OpenMP CPU multithreading variants and tunings of the kernel. + * `FOO-OMPTarget.cpp` contains OpenMP target offload variants and tunings of the kernel. + * `FOO-Cuda.cpp` contains CUDA GPU variants and tunings of the kernel. + * `FOO-Hip.cpp` contains HIP GPU variants and tunings of the kernel. All kernels in the Suite follow the same implementation pattern. Inspect the files for any kernel to understand the overall organization. - + Note: if a new execution back-end variant is added that is not listed here, -that variant should go in the file `FOO-.cpp`. Keeping the +that variant should go in the file `FOO-.cpp`. Keeping the back-end variants in separate files helps to understand compiler optimizations when looking at generated assembly code, for example. 
@@ -402,7 +480,7 @@ Here is what a header file for the FOO kernel object should look like: ```cpp //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -420,11 +498,11 @@ Here is what a header file for the FOO kernel object should look like: #include "common/KernelBase.hpp" -namespace rajaperf +namespace rajaperf { class RunParams; // Forward declaration for ctor arg. -namespace basic +namespace basic { class FOO : public KernelBase @@ -435,15 +513,15 @@ public: ~FOO(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tuning_idx); + void updateChecksum(VariantID vid, size_t tuning_idx); + void tearDown(VariantID vid, size_t tuning_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tuning_idx); + void runOpenMPVariant(VariantID vid, size_t tuning_idx); + void runCudaVariant(VariantID vid, size_t tuning_idx); + void runHipVariant(VariantID vid, size_t tuning_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tuning_idx); private: // Kernel-specific data (pointers, scalars, etc.) as needed... @@ -456,39 +534,39 @@ private: ``` The kernel object header has a uniquely-named header file include guard and -the class is nested within the `rajaperf` and `basic` namespaces. The +the class is nested within the `rajaperf` and `basic` namespaces. The constructor takes a reference to a `RunParams` object, which contains the -input parameters for running the Suite -- we'll say more about this later. 
-The methods that take a variant ID argument must be provided as they are -pure virtual in the KernelBase class. Their names are descriptive of what they -do and we'll provide more details about them when we describe the class -implementation next. +input parameters for running the Suite -- we'll say more about this later. +The methods that take a variant ID and tuning index arguments must be provided +as they are pure virtual in the KernelBase class. Their names are descriptive of +what they do and we'll provide more details about them when we describe the +class implementation next. #### Kernel class implementation -Each kernel in the Suite follows a similar implementation pattern for -consistency and ease of analysis and understanding. Here, we describe several -key steps and conventions that must be followed to ensure that all kernels +Each kernel in the Suite follows a similar implementation pattern for +consistency and ease of analysis and understanding. Here, we describe several +key steps and conventions that must be followed to ensure that all kernels interact with the performance Suite machinery in the same way: 1. Initialize the `KernelBase` class object with `KernelID` and `RunParams` object passed to the FOO class constructor. -2. In the class constructor, define kernel information. This includes: default problem size, default run repetition count, iterations per rep, kernels per rep, bytes per rep, FLOPs per rep, the RAJA features used by the kernel, and kernel variants defined (i.e., implemented) by calling the appropriate members in the `KernelBase`` class. See the *.cpp file for any existing kernel in the suite for examples of how this is done. +2. In the class constructor, define kernel information. 
This includes: default problem size, default run repetition count, iterations per rep, kernels per rep, bytes per rep, FLOPs per rep, the RAJA features used by the kernel, and kernel variants defined (i.e., implemented) by calling the appropriate members in the `KernelBase` class. See the *.cpp file for any existing kernel in the suite for examples of how this is done. Note that tuning names are added in step 6. 3. Implement data allocation and initialization operations for each kernel variant in the `setUp` method. 4. Compute the checksum for each variant in the `updateChecksum` method. 5. Deallocate and reset any data that will be allocated and/or initialized in subsequent kernel executions in the `tearDown` method. -6. Implement kernel execution for the associated variants in the `run*Variant` methods in the proper source files. +6. Implement kernel execution for the associated variants and tunings in the `run*Variant` methods in the proper source files. Add tuning names for the tunings of each variant by overriding the `KernelBase` methods `set*TuningDefinitions`. Note that this is not necessary if there is only one tuning. ##### Constructor and destructor It is important to note that there will only be one instance of each kernel -class created by the program. Thus, each kernel class constructor and -destructor must only perform operations that are not specific to any kernel +class created by the program. Thus, each kernel class constructor and +destructor must only perform operations that are not specific to any kernel variant. The constructor must pass the kernel ID and RunParams object to the base class `KernelBase` constructor. The body of the constructor must also call -base class methods to set kernel information described above. Note that -the arguments passed to each method are specific to each kernel, in general. +base class methods to set kernel information described above. 
Note that +the arguments passed to each method are specific to each kernel, in general. This code snippets shows a typical way this looks for a simple single for-loop data parallel kernel. @@ -501,11 +579,11 @@ FOO::FOO(const RunParams& params) // to generate an execution run time value setActualProblemSize( getTargetProblemSize() ); // actual problem size may - // be different than the + // be different than the // default size based on // user-provided run time // options - + setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( ... ); // value set based on data read and written when @@ -530,32 +608,34 @@ owned by the class object as needed. Often, it is empty. ##### setUp() method -The `setUp()` method is responsible for allocating and initializing data -necessary to run the kernel for the variant specified by its variant ID +The `setUp()` method is responsible for allocating and initializing data +necessary to run the kernel for the variant specified by its variant ID argument. For example, a baseline variant may have aligned data allocation to help enable SIMD optimizations, an OpenMP variant may initialize arrays -following a pattern of "first touch" based on how memory and threads are -mapped to CPU cores, a CUDA variant may initialize data in host memory, +following a pattern of "first touch" based on how memory and threads are +mapped to CPU cores, a CUDA variant may initialize data in host memory, which will be copied to device memory when a CUDA variant executes, etc. It is important to use the same data allocation and initialization operations for all kernel variants so that checksums can be compared at the end of a run. 
-Note: to simplify these operations and help ensure consistency, there exist +Note: to simplify these operations and help ensure consistency, there exist utility methods to allocate, initialize, deallocate, and copy data, and compute checksums defined in the `DataUtils.hpp` `CudaDataUtils.hpp`, `OpenMPTargetDataUtils.hpp`, etc. header files in the `common` directory. ##### run methods -Which files contain which 'run' methods and associated variant implementations -is described above. Each method takes a variant ID argument which identifies -the variant to be run. Each method is also responsible for calling base class -methods to start and stop execution timers when a loop variant is run. A -typical kernel execution code section may look like: +Which files contain which 'run' methods and associated variant and tuning +implementations is described above. Each method takes a variant ID argument +which identifies the variant to be run and a tuning index which identifies +the tuning of the variant to run. Note that the tuning index can be ignored +when there is only one tuning. Each method is also responsible for calling base +class methods to start and stop execution timers when a loop variant is run. +A typical kernel execution code section may look like: ```cpp -void Foo::runSeqVariant(VariantID vid) +void Foo::runSeqVariant(VariantID vid, size_t /*tuning_idx*/) { const Index_type run_reps = getRunReps(); // ... @@ -572,7 +652,7 @@ void Foo::runSeqVariant(VariantID vid) } stopTimer(); - break; + break; } #if defined(RUN_RAJA_SEQ) @@ -581,7 +661,7 @@ void Foo::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - // Implementation of Lambda_Seq kernel variant... + // Implementation of Lambda_Seq kernel variant... 
} stopTimer(); @@ -604,7 +684,7 @@ void Foo::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n : Unknown variant id = " << vid << std::endl; + getCout() << "\n : Unknown variant id = " << vid << std::endl; } } @@ -616,61 +696,66 @@ pattern will ensure all new additions are consistent. Important notes: - * As mentioned earlier, there are multiple source files for each kernel. - The reason for this is that it makes it easier to apply unique compiler - flags to different variants and to manage compilation and linking issues - that arise when some kernel variants are combined in the same translation + * As mentioned earlier, there are multiple source files for each kernel. + The reason for this is that it makes it easier to apply unique compiler + flags to different variants and to manage compilation and linking issues + that arise when some kernel variants are combined in the same translation unit. - * For convenience, we make heavy use of macros to define data declarations + * For convenience, we make heavy use of macros to define data declarations and kernel bodies in the Suite. While seemingly cryptic, this significantly reduces the amount of redundant code required to implement multiple variants - for each kernel and make sure things are the same as much as possible. The - kernel class implementation files in the Suite provide many examples of + for each kernel and make sure things are the same as much as possible. The + kernel class implementation files in the Suite provide many examples of the basic pattern we use. + * We also use macros to define some methods used with GPU block size tunings. + While seemingly cryptic, this significantly reduces the amount of redundant + code required to implement calling and naming each of the multiple tunings + for each kernel and make sure things are the same as much as possible. 
+ ##### updateChecksum() method The `updateChecksum()` method is responsible for adding the checksum -for the current kernel (based on the data the kernel computes) to the -checksum value for the variant of the kernel just executed, which is held -in the KernelBase base class object. +for the current kernel (based on the data the kernel computes) to the +checksum value for the variant and tuning of the kernel just executed, which is +held in the KernelBase base class object. It is important that the checksum be computed in the same way for -each variant of the kernel so that checksums for different variants can be -compared to help identify differences, and potential errors in +each variant of the kernel so that checksums for different variants can be +compared to help identify differences, and potential errors in implementations, compiler optimizations, programming model execution, etc. -Note: to simplify checksum computations and help ensure consistency, there +Note: to simplify checksum computations and help ensure consistency, there are methods to compute checksums, a weighted sum of array values for example, are defined in the `DataUtils.hpp` header file in the `common` directory. ##### tearDown() method The `tearDown()` method frees and/or resets all kernel data that is -allocated and/or initialized in the `setUp()` method execution to prepare for +allocated and/or initialized in the `setUp()` method execution to prepare for other kernel variants run subsequently. ### Add object construction operation -The `Executor` class in the `common` directory is responsible for creating -kernel objects for the kernels to be run based on the Suite input options. -To ensure a new kernel object will be created properly, add a call to its -class constructor based on its `KernelID` in the `getKernelObject()` +The `Executor` class in the `common` directory is responsible for creating +kernel objects for the kernels to be run based on the Suite input options. 
+To ensure a new kernel object will be created properly, add a call to its +class constructor based on its `KernelID` in the `getKernelObject()` method in the `RAJAPerfSuite.cpp` file. - + ## Adding a variant Each variant in the RAJA Performance Suite is identified by an enumeration value and a string name. Adding a new variant requires adding these two -items similarly to adding those for a kernel as described above. +items similarly to adding those for a kernel as described above. ### Add the variant ID and name -First, add an enumeration value identifier for the variant, that is unique -among all variants, in the enum 'VariantID' in the header file +First, add an enumeration value identifier for the variant, that is unique +among all variants, in the enum 'VariantID' in the header file `RAJAPerfSuite.hpp`: ```cpp @@ -701,32 +786,49 @@ and matching one-to-one). ### Add kernel variant implementations -In the classes containing kernels to which the new variant applies, -add implementations for the variant in the setup, kernel execution, -checksum computation, and teardown methods as needed. Also, make sure to -define the variant for those kernels in the kernel class constructors by -calling `setVariantDefined(NewVariant)` so that the variant can be run. -These operations are described in earlier sections for adding a new kernel +In the classes containing kernels to which the new variant applies, +add implementations for the variant in the setup, kernel execution, +checksum computation, and teardown methods as needed. Also, make sure to +define the variant for those kernels in the kernel class constructors by +calling `setVariantDefined(NewVariant)` so that the variant can be run. +These operations are described in earlier sections for adding a new kernel above. +### Add kernel tuning implementations + +In the classes containing kernels to which the new tuning applies, +add implementations for the tuning in the kernel execution and tuning naming +methods as needed. 
Note that the tuning indices are determined by the order that +the tuning names are added in the `set*TuningDefinitions` method. Therefore +the `run*Variant` methods should have similar logic in order to run the correct +tuning based on the index. + +* * * + +# Continuous Integration + +RAJAPerf Suite uses continuous integration to ensure that changes added to the repository are well integrated and tested for compatibility with the rest of the existing code base. Our CI tests include a variety of vetted configurations that run on different LC machines. + +RAJAPerf Suite shares its Gitlab CI workflow with other projects. The documentation is therefore [shared](https://radiuss-ci.readthedocs.io/en/latest). + * * * # Contributions -The RAJA Performance Suite is a work-in-progress, with new kernels and variants -added as new features and back-end support are developed in RAJA. We encourage -interested parties to contribute to it so that C++ compiler optimizations and -support for programming models like RAJA continue to improve. +The RAJA Performance Suite is a work-in-progress, with new kernels and variants +added as new features and back-end support are developed in RAJA. We encourage +interested parties to contribute to it so that C++ compiler optimizations and +support for programming models like RAJA continue to improve. The Suite developers follow the [GitFlow](http://nvie.com/posts/a-successful-git-branching-model/) development model. Folks wishing to contribute to the Suite, -should include their work in a feature branch created from the Performance -Suite `develop` branch. Then, create a pull request with the `develop` branch -as the destination when it is ready to be reviewed. The `develop` branch +should include their work in a feature branch created from the Performance +Suite `develop` branch. Then, create a pull request with the `develop` branch +as the destination when it is ready to be reviewed. 
The `develop` branch contains the latest work in RAJA Performance Suite. Periodically, we merge the develop branch into the `main` branch and tag a new release. -If you would like to contribute to the RAJA Performance Suite, or have -questions about doing so, please contact the maintainer of the Suite listed +If you would like to contribute to the RAJA Performance Suite, or have +questions about doing so, please contact the maintainer of the Suite listed below. * * * @@ -737,7 +839,7 @@ The primary developer/maintainer of the RAJA Performance Suite: * Rich Hornung (hornung1@llnl.gov) -Please see the {RAJA Performance Suite Contributors Page](https://github.com/LLNL/RAJAPerf/graphs/contributors), to see the full list of contributors to the +Please see the [RAJA Performance Suite Contributors Page](https://github.com/LLNL/RAJAPerf/graphs/contributors), to see the full list of contributors to the project. * * * @@ -776,18 +878,18 @@ text in the license header: # External Packages -The RAJA Performance Suite has some external dependencies, which are included +The RAJA Performance Suite has some external dependencies, which are included as Git submodules. These packages are covered by various permissive licenses. A summary listing follows. See the license included with each package for full details. -PackageName: BLT -PackageHomePage: https://github.com/LLNL/blt/ +PackageName: BLT +PackageHomePage: https://github.com/LLNL/blt/ PackageLicenseDeclared: BSD-3-Clause -PackageName: RAJA -PackageHomePage: http://github.com/LLNL/RAJA/ -PackageLicenseDeclared: BSD-3-Clause +PackageName: RAJA +PackageHomePage: http://github.com/LLNL/RAJA/ +PackageLicenseDeclared: BSD-3-Clause * * * diff --git a/RELEASE b/RELEASE index 69efe8772..9096758f6 100644 --- a/RELEASE +++ b/RELEASE @@ -2,7 +2,7 @@ RAJA Performance Suite: ................................, version 0.11.0 -Copyright (c) 2017-2021, Lawrence Livermore National Security, LLC. 
+Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. All rights reserved. See details in the RAJAPerf/LICENSE file. diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 02d749987..cfd56b94e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -5,7 +5,7 @@ variables: COMPILER: 'g++' jobs: -#- job: Windows #Commenting out until windows builds are supported. +#- job: Windows #temporarily commenting out until cmake/azure version issue resolved # strategy: # matrix: # shared: @@ -15,7 +15,7 @@ jobs: # pool: # vmImage: 'windows-2019' # variables: -# CMAKE_EXTRA_FLAGS: '-DENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=17' +# CMAKE_EXTRA_FLAGS: '-DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=17' # steps: # - checkout: self # clean: boolean @@ -27,38 +27,49 @@ jobs: # - task: CMake@1 # inputs: # workingDir: 'build' -# cmakeArgs: '--build . --config Release --verbose' +# cmakeArgs: '--build . 
--config Release --verbose -j 4' +# - task: CmdLine@2 +# inputs: +# script: 'ctest.exe -T test -C Release' +# workingDirectory: 'build' +# condition: eq( variables['Agent.OS'], 'Windows_NT') +# - task: PublishTestResults@2 +# inputs: +# testResultsFormat: 'cTest' +# testResultsFiles: '**/Test.xml' - job: Docker timeoutInMinutes: 360 strategy: matrix: - gcc5: - docker_target: gcc5 - gcc5-debug: - docker_target: gcc5-debug - gcc6: - docker_target: gcc6 gcc7: docker_target: gcc7 gcc8: docker_target: gcc8 - clang9: - docker_target: clang9 - clang9-debug: - docker_target: clang9-debug + gcc9: + docker_target: gcc9 + gcc11: + docker_target: gcc11 + clang11: + docker_target: clang11 +## clang11-debug: +## docker_target: clang11-debug + clang13: + docker_target: clang13 nvcc10: docker_target: nvcc10 - nvcc10-debug: - docker_target: nvcc10-debug + nvcc11: + docker_target: nvcc11 +## nvcc11-debug: +## docker_target: nvcc11-debug hip: docker_target: hip -# sycl: -# docker_target: sycl +# sycl: +# docker_target: sycl pool: vmImage: 'ubuntu-latest' variables: DOCKER_BUILDKIT: '1' - CMAKE_EXTRA_FLAGS: '-DENABLE_DEVELOPER_BENCHMARKS=On -DENABLE_DEVELOPER_DEFAULTS=On -DCMAKE_CXX_STANDARD=11' + CMAKE_EXTRA_FLAGS: '-DENABLE_DEVELOPER_BENCHMARKS=On -DENABLE_DEVELOPER_DEFAULTS=On -DCMAKE_CXX_STANDARD=14' steps: - checkout: self clean: boolean @@ -68,10 +79,6 @@ jobs: command: build dockerFile: 'Dockerfile' arguments: '--target $(docker_target)' - - script: | - bash <(curl -s https://raw.githubusercontent.com/codecov/codecov-bash/0b376529f626b50b7d4a9fb734e0e50d28b9b91e/codecov) >& /dev/null - displayName: 'Upload code coverage' - condition: eq( variables['docker_target'], 'gcc') - job: Mac pool: vmImage: 'macOS-latest' @@ -87,11 +94,11 @@ jobs: cmakeArgs: '$(CMAKE_EXTRA_FLAGS) ../' - script: | cd build - make + make -j 4 displayName: 'OSX Build' condition: eq( variables['Agent.OS'], 'Darwin') - script: | cd build - ./bin/raja-perf.exe + ./bin/raja-perf.exe --checkrun 5 -sp 
displayName: 'Run Perf Suite' condition: eq( variables['Agent.OS'], 'Darwin') diff --git a/blt b/blt index ddd5a0ca7..296bf64e6 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit ddd5a0ca7c566d0ae14270b66625c8a363630ddb +Subproject commit 296bf64e64edfcfcce6a53e3b396d6529e76b986 diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh new file mode 100755 index 000000000..324f964ad --- /dev/null +++ b/scripts/gitlab/build_and_test.sh @@ -0,0 +1,235 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set -o errexit +set -o nounset + +option=${1:-""} +hostname="$(hostname)" +truehostname=${hostname//[0-9]/} +project_dir="$(pwd)" + +build_root=${BUILD_ROOT:-""} +hostconfig=${HOST_CONFIG:-""} +spec=${SPEC:-""} +job_unique_id=${CI_JOB_ID:-""} +raja_version=${UPDATE_RAJA:-""} + +sys_type=${SYS_TYPE:-""} +py_env_path=${PYTHON_ENVIRONMENT_PATH:-""} + +# Dependencies +date +if [[ "${option}" != "--build-only" && "${option}" != "--test-only" ]] +then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Building Dependencies" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + if [[ -z ${spec} ]] + then + echo "SPEC is undefined, aborting..." 
+ exit 1 + fi + + prefix_opt="" + + if [[ -d /dev/shm ]] + then + prefix="/dev/shm/${hostname}" + if [[ -z ${job_unique_id} ]]; then + job_unique_id=manual_job_$(date +%s) + while [[ -d ${prefix}/${job_unique_id} ]] ; do + sleep 1 + job_unique_id=manual_job_$(date +%s) + done + fi + + prefix="${prefix}/${job_unique_id}" + mkdir -p ${prefix} + prefix_opt="--prefix=${prefix}" + fi + + python3 tpl/RAJA/scripts/uberenv/uberenv.py --project-json=".uberenv_config.json" --spec="${spec}" ${prefix_opt} + + mv ${project_dir}/tpl/RAJA/hc-*.cmake ${project_dir}/. + +fi +date + +# Host config file +if [[ -z ${hostconfig} ]] +then + # If no host config file was provided, we assume it was generated. + # This means we are looking for a unique one in project dir. + hostconfigs=( $( ls "${project_dir}/"hc-*.cmake ) ) + if [[ ${#hostconfigs[@]} == 1 ]] + then + hostconfig_path=${hostconfigs[0]} + echo "Found host config file: ${hostconfig_path}" + elif [[ ${#hostconfigs[@]} == 0 ]] + then + echo "No result for: ${project_dir}/hc-*.cmake" + echo "Spack generated host-config not found." + exit 1 + else + echo "More than one result for: ${project_dir}/hc-*.cmake" + echo "${hostconfigs[@]}" + echo "Please specify one with HOST_CONFIG variable" + exit 1 + fi +else + # Using provided host-config file. 
+ hostconfig_path="${project_dir}/host-configs/${hostconfig}" +fi + +# Build Directory +if [[ -z ${build_root} ]] +then + build_root=$(pwd) +fi + +build_dir="${build_root}/build_${hostconfig//.cmake/}" + +# Build +if [[ "${option}" != "--deps-only" && "${option}" != "--test-only" ]] +then + date + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~ Host-config: ${hostconfig_path}" + echo "~ Build Dir: ${build_dir}" + echo "~ Project Dir: ${project_dir}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~ ENV ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Building RAJA PerfSuite" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + # Map CPU core allocations + declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32) + + # If using Multi-project, set up the submodule + if [[ -n ${raja_version} ]] + then + cd tpl/RAJA + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~ Updating RAJA Submodule to develop ~~~" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + git pull origin develop + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~ Updating Submodules within RAJA ~~~~~~" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + git submodule update --init --recursive + cd - + fi + + # If building, then delete everything first + # NOTE: 'cmake --build . -j core_counts' attempts to reduce individual build resources. + # If core_counts does not contain hostname, then will default to '-j ', which should + # use max cores. + rm -rf ${build_dir} 2>/dev/null + mkdir -p ${build_dir} && cd ${build_dir} + + date + + if [[ "${truehostname}" == "corona" ]] + then + module unload rocm + fi + + cmake \ + -C ${hostconfig_path} \ + ${project_dir} + if echo ${spec} | grep -q "intel" ; then + cmake --build . 
-j 16 + echo "~~~~~~~~~ Build Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "cmake --build . -j 16" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + else + cmake --build . -j ${core_counts[$truehostname]} + echo "~~~~~~~~~ Build Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "cmake --build . -j ${core_counts[$truehostname]}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + fi + date +fi + +if [[ ! -d ${build_dir} ]] +then + echo "ERROR: Build directory not found : ${build_dir}" && exit 1 +fi + +cd ${build_dir} + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ TESTING RAJAPERF SUITE" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} +then + + # + # Maintaining separate, but identical release and debug sections + # in case we want to make them distinct in the future. + # + + if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + then + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "lrun -n1 ... ctest --output-on-failure -T test" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test + else + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "lrun -n1 ... 
ctest --output-on-failure -T test" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test + fi + else + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + then + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + ctest --output-on-failure -T test 2>&1 | tee tests_output.txt + else + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + ctest --output-on-failure -T test 2>&1 | tee tests_output.txt + fi + fi + + no_test_str="No tests were found!!!" + if [[ "$(tail -n 1 tests_output.txt)" == "${no_test_str}" ]] + then + echo "ERROR: No tests were found" && exit 1 + fi + + echo "Copying Testing xml reports for export" + tree Testing + xsltproc -o junit.xml ${project_dir}/blt/tests/ctest-to-junit.xsl Testing/*/Test.xml + mv junit.xml ${project_dir}/junit.xml + + if grep -q "Errors while running CTest" ./tests_output.txt + then + echo "ERROR: failure(s) while running CTest" && exit 1 + fi +fi + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ CLEAN UP" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +make clean + diff --git a/scripts/install_llvm.sh b/scripts/install_llvm.sh index 6d1197004..60bfccd39 100755 --- a/scripts/install_llvm.sh +++ b/scripts/install_llvm.sh @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index 932108c62..2329cca3c 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -36,6 +36,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh index cfce68a7f..e557c2dac 100755 --- a/scripts/lc-builds/blueos_clang_omptarget.sh +++ b/scripts/lc-builds/blueos_clang_omptarget.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) @@ -36,6 +36,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=Off \ diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh index f3cfbcc94..b3ecbeb70 100755 --- a/scripts/lc-builds/blueos_gcc.sh +++ b/scripts/lc-builds/blueos_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -34,6 +34,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 96a8c1421..105938283 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) @@ -41,7 +41,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++11 \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 407ccf88b..46f64ee17 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -41,7 +41,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \ - -DBLT_CXX_STD=c++11 \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index 1263c1412..950505cfc 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) @@ -41,7 +41,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-${COMP_XL_VER}/bin/xlc++_r \ - -DBLT_CXX_STD=c++11 \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh index a49a546c9..d6c915fb9 100755 --- a/scripts/lc-builds/blueos_pgi.sh +++ b/scripts/lc-builds/blueos_pgi.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -34,6 +34,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgc++ \ + -DBLT_CXX_STD=c++14 \ -DCMAKE_C_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgcc \ -C ${RAJA_HOST_CONFIG} \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh new file mode 100755 index 000000000..83bcb2903 --- /dev/null +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 4 ]]; then + echo + echo "You must pass 4 arguments to the script (in this order): " + echo " 1) compiler version number for spectrum mpi" + echo " 2) compiler version number for nvcc" + echo " 3) CUDA compute architecture" + echo " 4) compiler version number for clang. " + echo + echo "For example: " + echo " blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 sm_70 10.0.1" + exit +fi + +COMP_MPI_VER=$1 +COMP_NVCC_VER=$2 +COMP_ARCH=$3 +COMP_CLANG_VER=$4 +shift 4 + +BUILD_SUFFIX=lc_blueos-spectrum${COMP_MPI_VER}-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.14.5 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMPI_CXX_COMPILER=/usr/tce/packages/spectrum-mpi/spectrum-mpi-${COMP_MPI_VER}-clang-${COMP_CLANG_VER}/bin/mpiclang++ \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=On \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCUDA_ARCH=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. 
+ +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to run with mpi when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -n4 ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 6f4d961b6..8630e419d 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -34,7 +34,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-${COMP_VER}/bin/xlc++_r \ - -DBLT_CXX_STD=c++11 \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index 42c4b5844..9d18d4622 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) @@ -34,7 +34,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-${COMP_VER}/bin/xlc++_r \ - -DBLT_CXX_STD=c++11 \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_TARGET_OPENMP=On \ diff --git a/scripts/lc-builds/toss3_clang.sh b/scripts/lc-builds/toss3_clang.sh index d00d63217..e3b51716d 100755 --- a/scripts/lc-builds/toss3_clang.sh +++ b/scripts/lc-builds/toss3_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -34,6 +34,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss3_gcc.sh b/scripts/lc-builds/toss3_gcc.sh index 2851ea4af..031b01a7b 100755 --- a/scripts/lc-builds/toss3_gcc.sh +++ b/scripts/lc-builds/toss3_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) @@ -34,6 +34,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss3_hipcc.sh b/scripts/lc-builds/toss3_hipcc.sh index e515804ea..464c8390f 100755 --- a/scripts/lc-builds/toss3_hipcc.sh +++ b/scripts/lc-builds/toss3_hipcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -45,6 +45,7 @@ cmake \ -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang \ -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang++ \ -DHIP_HIPCC_FLAGS=--offload-arch=${COMP_ARCH} \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=OFF \ diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss3_icpc.sh index 207a892b2..9c941742f 100755 --- a/scripts/lc-builds/toss3_icpc.sh +++ b/scripts/lc-builds/toss3_icpc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) @@ -48,7 +48,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icpc \ -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icc \ - -DBLT_CXX_STD=c++11 \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss3_mvapich2_gcc.sh b/scripts/lc-builds/toss3_mvapich2_gcc.sh new file mode 100755 index 000000000..654f9624f --- /dev/null +++ b/scripts/lc-builds/toss3_mvapich2_gcc.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [ "$1" == "" ]; then + echo + echo "You must pass a compiler version number to script. For example," + echo " toss3_mvapich2_gcc.sh 2.3 10.2.1" + exit +fi + +MPI_VER=$1 +COMP_VER=$2 +shift 2 + +BUILD_SUFFIX=lc_toss3-mvapich2-${MPI_VER}-gcc-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.14.5 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMPI_CXX_COMPILER=/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-gcc-${COMP_VER}/bin/mpic++ \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=On \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. 
+ +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to run with mpi when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " srun -n2 ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_pgi.sh b/scripts/lc-builds/toss3_pgi.sh index 0c0e33a81..cd778d5fe 100755 --- a/scripts/lc-builds/toss3_pgi.sh +++ b/scripts/lc-builds/toss3_pgi.sh @@ -35,6 +35,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgc++ \ -DCMAKE_C_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgcc \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh new file mode 100755 index 000000000..4b063be04 --- /dev/null +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 2 ]]; then + echo + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP compute architecture" + echo " 3...) 
optional arguments to cmake" + echo + echo "For example: " + echo " toss4_amdclang.sh 4.1.0 gfx906" + exit +fi + +COMP_VER=$1 +COMP_ARCH=$2 +shift 2 + +HOSTCONFIG="hip_3_X" + +if [[ ${COMP_VER} == 4.* ]] +then +##HIP_CLANG_FLAGS="-mllvm -amdgpu-fixed-function-abi=1" + HOSTCONFIG="hip_4_link_X" +elif [[ ${COMP_VER} == 3.* ]] +then + HOSTCONFIG="hip_3_X" +else + echo "Unknown hip version, using ${HOSTCONFIG} host-config" +fi + +BUILD_SUFFIX=lc_toss4-amdclang-${COMP_VER}-${COMP_ARCH} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.14.5 + +# unload rocm to avoid configuration problems where the loaded rocm and COMP_VER +# are inconsistent causing the rocprim from the module to be used unexpectedly +module unload rocm + + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ + -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ + -DHIP_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ + -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \ + -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \ + -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=OFF \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; unload the rocm module" +echo " or load the appropriate rocm module (${COMP_VER}) when building." 
+echo +echo " module unload rocm" +echo " srun -n1 make" +echo +echo "***********************************************************************" diff --git a/scripts/make_release_tarball.sh b/scripts/make_release_tarball.sh index cbd4b4fa7..6b0a3b804 100755 --- a/scripts/make_release_tarball.sh +++ b/scripts/make_release_tarball.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/spack_packages/hip/package.py b/scripts/spack_packages/hip/package.py new file mode 100644 index 000000000..f99d26dc6 --- /dev/null +++ b/scripts/spack_packages/hip/package.py @@ -0,0 +1,55 @@ +from spack import * + + +class Hip(CMakePackage): + """HIP is a C++ Runtime API and Kernel Language that allows developers to + create portable applications for AMD and NVIDIA GPUs from + single source code.""" + + homepage = "https://github.com/ROCm-Developer-Tools/HIP" + url = "https://github.com/ROCm-Developer-Tools/HIP/archive/refs/tags/rocm-4.0.0.tar.gz" + + maintainers = ['srekolam', 'arjun-raj-kuppala'] + + version('4.1.0', sha256='25ad58691456de7fd9e985629d0ed775ba36a2a0e0b21c086bd96ba2fb0f7ed1') + version('4.0.0', sha256='0082c402f890391023acdfd546760f41cb276dffc0ffeddc325999fd2331d4e8') + + depends_on('cmake@3:', type='build') + depends_on('perl@5.10:', type=('build', 'run')) + depends_on('mesa~llvm@18.3:') + + for ver in ['4.0.0', '4.1.0']: + depends_on('rocclr@' + ver, type='build', when='@' + ver) + depends_on('hsakmt-roct@' + ver, type='build', when='@' + ver) + depends_on('hsa-rocr-dev@' + ver, type='link', when='@' + ver) + depends_on('comgr@' + ver, type='build', when='@' + ver) + depends_on('llvm-amdgpu@' + ver, type='build', when='@' + ver) + 
depends_on('rocm-device-libs@' + ver, type='build', when='@' + ver) + depends_on('rocminfo@' + ver, type='build', when='@' + ver) + + def setup_dependent_package(self, module, dependent_spec): + self.spec.hipcc = join_path(self.prefix.bin, 'hipcc') + + @run_before('install') + def filter_sbang(self): + perl = self.spec['perl'].command + kwargs = {'ignore_absent': False, 'backup': False, 'string': False} + + with working_dir('bin'): + match = '^#!/usr/bin/perl' + substitute = "#!{perl}".format(perl=perl) + files = [ + 'hipify-perl', 'hipcc', 'extractkernel', + 'hipconfig', 'hipify-cmakefile' + ] + filter_file(match, substitute, *files, **kwargs) + + def cmake_args(self): + args = [ + '-DHIP_COMPILER=clang', + '-DHIP_PLATFORM=rocclr', + '-DHSA_PATH={0}'.format(self.spec['hsa-rocr-dev'].prefix), + '-DLIBROCclr_STATIC_DIR={0}/lib'.format(self.spec['rocclr'].prefix) + ] + return args + diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py new file mode 100644 index 000000000..c0ba13602 --- /dev/null +++ b/scripts/spack_packages/raja_perf/package.py @@ -0,0 +1,362 @@ +# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +from spack import * + +import socket +import os + +from os import environ as env +from os.path import join as pjoin + +import re + +def cmake_cache_entry(name, value, comment=""): + """Generate a string for a cmake cache variable""" + + return 'set(%s "%s" CACHE PATH "%s")\n\n' % (name,value,comment) + + +def cmake_cache_string(name, string, comment=""): + """Generate a string for a cmake cache variable""" + + return 'set(%s "%s" CACHE STRING "%s")\n\n' % (name,string,comment) + + +def cmake_cache_option(name, boolean_value, comment=""): + """Generate a string for a cmake configuration option""" + + value = "ON" if boolean_value else "OFF" + return 'set(%s %s CACHE BOOL "%s")\n\n' % (name,value,comment) + + +def get_spec_path(spec, package_name, path_replacements = {}, use_bin = False) : + """Extracts the prefix path for the given spack package + path_replacements is a dictionary with string replacements for the path. + """ + + if not use_bin: + path = spec[package_name].prefix + else: + path = spec[package_name].prefix.bin + + path = os.path.realpath(path) + + for key in path_replacements: + path = path.replace(key, path_replacements[key]) + + return path + + +class RajaPerf(CMakePackage, CudaPackage, ROCmPackage): + """RAJAPerf Suite Framework.""" + + homepage = "http://software.llnl.gov/RAJAPerf/" + git = "https://github.com/LLNL/RAJAPerf.git" + + version('develop', branch='develop', submodules='True') + version('main', branch='main', submodules='True') + version('0.11.0', tag='v0.11.0', submodules="True") + version('0.10.0', tag='v0.10.0', submodules="True") + version('0.9.0', tag='v0.9.0', submodules="True") + version('0.8.0', tag='v0.8.0', submodules="True") + version('0.7.0', tag='v0.7.0', submodules="True") + version('0.6.0', tag='v0.6.0', submodules="True") + version('0.5.2', tag='v0.5.2', submodules="True") + version('0.5.1', tag='v0.5.1', submodules="True") + version('0.5.0', tag='v0.5.0', 
submodules="True") + version('0.4.0', tag='v0.4.0', submodules="True") + + variant('openmp', default=True, description='Build OpenMP backend') + variant('openmp_target', default=False, description='Build with OpenMP target support') + variant('shared', default=False, description='Build Shared Libs') + variant('libcpp', default=False, description='Uses libc++ instead of libstdc++') + variant('tests', default='basic', values=('none', 'basic', 'benchmarks'), + multi=False, description='Tests to run') + + depends_on('cmake@3.9:', type='build') + depends_on('blt@0.4.1', type='build', when='@main') + depends_on('blt@0.4.1:', type='build') + + conflicts('+openmp', when='+rocm') + conflicts('~openmp', when='+openmp_target', msg='OpenMP target requires OpenMP') + + phases = ['hostconfig', 'cmake', 'build', 'install'] + + def _get_sys_type(self, spec): + sys_type = str(spec.architecture) + # if on llnl systems, we can use the SYS_TYPE + if "SYS_TYPE" in env: + sys_type = env["SYS_TYPE"] + return sys_type + + def _get_host_config_path(self, spec): + var='' + if '+cuda' in spec: + var= '-'.join([var,'cuda']) + if '+libcpp' in spec: + var='-'.join([var,'libcpp']) + + host_config_path = "hc-%s-%s-%s%s-%s.cmake" % (socket.gethostname().rstrip('1234567890'), + self._get_sys_type(spec), + spec.compiler, + var, + spec.dag_hash()) + dest_dir = self.stage.source_path + host_config_path = os.path.abspath(pjoin(dest_dir, host_config_path)) + return host_config_path + + def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): + """ + This method creates a 'host-config' file that specifies + all of the options used to configure and build Umpire. + + For more details about 'host-config' files see: + http://software.llnl.gov/conduit/building.html + + Note: + The `py_site_pkgs_dir` arg exists to allow a package that + subclasses this package provide a specific site packages + dir when calling this function. `py_site_pkgs_dir` should + be an absolute path or `None`. 
+ + This is necessary because the spack `site_packages_dir` + var will not exist in the base class. For more details + on this issue see: https://github.com/spack/spack/issues/6261 + """ + + ####################### + # Compiler Info + ####################### + c_compiler = env["SPACK_CC"] + cpp_compiler = env["SPACK_CXX"] + + # Even though we don't have fortran code in our project we sometimes + # use the Fortran compiler to determine which libstdc++ to use + f_compiler = "" + if "SPACK_FC" in env.keys(): + # even if this is set, it may not exist + # do one more sanity check + if os.path.isfile(env["SPACK_FC"]): + f_compiler = env["SPACK_FC"] + + ####################################################################### + # By directly fetching the names of the actual compilers we appear + # to be doing something evil here, but this is necessary to create a + # 'host config' file that works outside of the spack install env. + ####################################################################### + + sys_type = self._get_sys_type(spec) + + ############################################## + # Find and record what CMake is used + ############################################## + + cmake_exe = spec['cmake'].command.path + cmake_exe = os.path.realpath(cmake_exe) + + host_config_path = self._get_host_config_path(spec) + cfg = open(host_config_path, "w") + cfg.write("###################\n".format("#" * 60)) + cfg.write("# Generated host-config - Edit at own risk!\n") + cfg.write("###################\n".format("#" * 60)) + cfg.write("# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC and\n") + cfg.write("# other RAJAPerf contributors. 
See the top-level LICENSE file for\n") + cfg.write("# details.\n") + cfg.write("#\n") + cfg.write("# SPDX-License-Identifier: (BSD-3-Clause) \n") + cfg.write("###################\n\n".format("#" * 60)) + + cfg.write("#------------------\n".format("-" * 60)) + cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) + cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) + cfg.write("# CMake executable path: %s\n" % cmake_exe) + cfg.write("#------------------\n\n".format("-" * 60)) + + cfg.write(cmake_cache_string("CMAKE_BUILD_TYPE", spec.variants['build_type'].value)) + + ####################### + # Compiler Settings + ####################### + + cfg.write("#------------------\n".format("-" * 60)) + cfg.write("# Compilers\n") + cfg.write("#------------------\n\n".format("-" * 60)) + cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) + cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) + + # use global spack compiler flags + cflags = ' '.join(spec.compiler_flags['cflags']) + if "+libcpp" in spec: + cflags += ' '.join([cflags,"-DGTEST_HAS_CXXABI_H_=0"]) + if cflags: + cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) + + cxxflags = ' '.join(spec.compiler_flags['cxxflags']) + if "+libcpp" in spec: + cxxflags += ' '.join([cxxflags,"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0"]) + if cxxflags: + cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) + + if ("gfortran" in f_compiler) and ("clang" in cpp_compiler): + libdir = pjoin(os.path.dirname( + os.path.dirname(f_compiler)), "lib") + flags = "" + for _libpath in [libdir, libdir + "64"]: + if os.path.exists(_libpath): + flags += " -Wl,-rpath,{0}".format(_libpath) + description = ("Adds a missing libstdc++ rpath") + #if flags: + # cfg.write(cmake_cache_string("BLT_EXE_LINKER_FLAGS", flags, + # description)) + + gcc_toolchain_regex = re.compile("--gcc-toolchain=(.*)") + gcc_name_regex = re.compile(".*gcc-name.*") + + using_toolchain = list(filter(gcc_toolchain_regex.match, 
spec.compiler_flags['cxxflags'])) + if(using_toolchain): + gcc_toolchain_path = gcc_toolchain_regex.match(using_toolchain[0]) + using_gcc_name = list(filter(gcc_name_regex.match, spec.compiler_flags['cxxflags'])) + compilers_using_toolchain = ["pgi", "xl", "icpc"] + if any(compiler in cpp_compiler for compiler in compilers_using_toolchain): + if using_toolchain or using_gcc_name: + cfg.write(cmake_cache_entry("BLT_CMAKE_IMPLICIT_LINK_DIRECTORIES_EXCLUDE", + "/usr/tce/packages/gcc/gcc-4.9.3/lib64;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64;/usr/tce/packages/gcc/gcc-4.9.3/lib64/gcc/x86_64-unknown-linux-gnu/4.9.3")) + + compilers_using_cxx14 = ["intel-17", "intel-18", "xl"] + if any(compiler in cpp_compiler for compiler in compilers_using_cxx14): + cfg.write(cmake_cache_entry("BLT_CXX_STD", "c++14")) + + if "+cuda" in spec: + cfg.write("#------------------{0}\n".format("-" * 60)) + cfg.write("# Cuda\n") + cfg.write("#------------------{0}\n\n".format("-" * 60)) + + cfg.write(cmake_cache_option("ENABLE_CUDA", True)) + + cudatoolkitdir = spec['cuda'].prefix + cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", + cudatoolkitdir)) + cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" + cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", + cudacompiler)) + + cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) + cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + + if ("xl" in cpp_compiler): + cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O2 -Xcompiler -qstrict " + + "-Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " + + "-Xcompiler -qsmp=omp -Xcompiler -qhot -Xcompiler -qnoeh -Xcompiler -qsuppress=1500-029 " + + "-Xcompiler -qsuppress=1500-036 -Xcompiler -qsuppress=1500-030")) + cuda_release_flags = "-O3" + cuda_reldebinf_flags = "-O3 -g" + cuda_debug_flags = "-O0 -g" + + elif ("gcc" in cpp_compiler): + cuda_release_flags = "-O3 
-Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + + else: + cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions" + cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions" + cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions" + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", cuda_release_flags)) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", cuda_reldebinf_flags)) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", cuda_debug_flags)) + + if not spec.satisfies('cuda_arch=none'): + cuda_arch = spec.variants['cuda_arch'].value + cfg.write(cmake_cache_string("CUDA_ARCH", 'sm_{0}'.format(cuda_arch[0]))) + + else: + cfg.write(cmake_cache_option("ENABLE_CUDA", False)) + + if "+rocm" in spec: + cfg.write("#------------------{0}\n".format("-" * 60)) + cfg.write("# HIP\n") + cfg.write("#------------------{0}\n\n".format("-" * 60)) + + cfg.write(cmake_cache_option("ENABLE_HIP", True)) + cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + + hip_root = spec['hip'].prefix + rocm_root = hip_root + "/.." 
+ cfg.write(cmake_cache_entry("HIP_ROOT_DIR", + hip_root)) + cfg.write(cmake_cache_entry("ROCM_ROOT_DIR", + rocm_root)) + cfg.write(cmake_cache_entry("HIP_PATH", + rocm_root + '/llvm/bin')) + cfg.write(cmake_cache_entry("CMAKE_HIP_ARCHITECTURES", 'gfx906')) + + hipcc_flags = ['--amdgpu-target=gfx906'] + + cfg.write(cmake_cache_entry("HIP_HIPCC_FLAGS", ';'.join(hipcc_flags))) + + #cfg.write(cmake_cache_entry("HIP_RUNTIME_INCLUDE_DIRS", + # "{0}/include;{0}/../hsa/include".format(hip_root))) + #hip_link_flags = "-Wl,--disable-new-dtags -L{0}/lib -L{0}/../lib64 -L{0}/../lib -Wl,-rpath,{0}/lib:{0}/../lib:{0}/../lib64 -lamdhip64 -lhsakmt -lhsa-runtime64".format(hip_root) + if ('%gcc' in spec) or (using_toolchain): + if ('%gcc' in spec): + gcc_bin = os.path.dirname(self.compiler.cxx) + gcc_prefix = join_path(gcc_bin, '..') + else: + gcc_prefix = gcc_toolchain_path.group(1) + cfg.write(cmake_cache_entry("HIP_CLANG_FLAGS", + "--gcc-toolchain={0}".format(gcc_prefix))) + cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", + " -Wl,-rpath {}/lib64".format(gcc_prefix))) + #else: + # cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", hip_link_flags)) + + else: + cfg.write(cmake_cache_option("ENABLE_HIP", False)) + + cfg.write(cmake_cache_option("ENABLE_OPENMP_TARGET", "+openmp_target" in spec)) + if "+openmp_target" in spec: + if ('%xl' in spec): + cfg.write(cmake_cache_string("OpenMP_CXX_FLAGS", "-qsmp=omp;-qoffload;-qnoeh;-qalias=noansi")) + if ('%clang' in spec): + cfg.write(cmake_cache_string("OpenMP_CXX_FLAGS", "-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda")) + cfg.write(cmake_cache_option("ENABLE_CUDA", False)) + + + cfg.write("#------------------{0}\n".format("-" * 60)) + cfg.write("# Other\n") + cfg.write("#------------------{0}\n\n".format("-" * 60)) + + cfg.write(cmake_cache_string("RAJA_RANGE_ALIGN", "4")) + cfg.write(cmake_cache_string("RAJA_RANGE_MIN_LENGTH", "32")) + cfg.write(cmake_cache_string("RAJA_DATA_ALIGN", "64")) + + 
cfg.write(cmake_cache_option("RAJA_HOST_CONFIG_LOADED", True)) + + # shared vs static libs + cfg.write(cmake_cache_option("BUILD_SHARED_LIBS","+shared" in spec)) + cfg.write(cmake_cache_option("ENABLE_OPENMP","+openmp" in spec)) + + cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) + cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + + ####################### + # Close and save + ####################### + cfg.write("\n") + cfg.close() + + print("OUT: host-config file {0}".format(host_config_path)) + + def cmake_args(self): + spec = self.spec + host_config_path = self._get_host_config_path(spec) + + options = [] + options.extend(['-C', host_config_path]) + + return options diff --git a/scripts/sweep_size.sh b/scripts/sweep_size.sh index 20fe6bdfd..65c3a94e5 100755 --- a/scripts/sweep_size.sh +++ b/scripts/sweep_size.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash EXECUTABLES="" SIZE_MIN=10000 @@ -8,18 +8,18 @@ SIZE_RATIO=2 ################################################################################ # # Usage: -# srun -n1 --exclusive sweep.sh -x raja-perf.exe [-- raja perf args] +# srun -n1 --exclusive sweep.sh -x raja-perf.exe [-- ] # # Parse any args for this script and consume them using shift # leave the raja perf arguments if any for later use # # Examples: -# lalloc 1 lrun -n1 sweep.sh -x raja-perf.exe -- args +# lalloc 1 lrun -n1 sweep.sh -x raja-perf.exe -- # # run a sweep of default problem sizes with executable `raja-perf.exe` # # with args `args` # # srun -n1 --exclusive sweep.sh -x raja-perf.exe --size-min 1000 -# --size-max 10000 --size-ratio 2 -- args +# --size-max 10000 --size-ratio 2 -- # # run a sweep of problem sizes 1K to 10K with ratio 2 (1K, 2K, 4K, 8K) # # with executable `raja-perf.exe` with args `args` # diff --git a/scripts/travis_build_and_test.sh b/scripts/travis_build_and_test.sh index 290cdff13..d53c88550 100755 --- a/scripts/travis_build_and_test.sh +++ 
b/scripts/travis_build_and_test.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/ubuntu-builds/ubuntu_clang.sh b/scripts/ubuntu-builds/ubuntu_clang.sh index 7001ddd39..664685f42 100755 --- a/scripts/ubuntu-builds/ubuntu_clang.sh +++ b/scripts/ubuntu-builds/ubuntu_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/ubuntu-builds/ubuntu_gcc.sh b/scripts/ubuntu-builds/ubuntu_gcc.sh index 8f7fc8a16..f0eb7fcf7 100755 --- a/scripts/ubuntu-builds/ubuntu_gcc.sh +++ b/scripts/ubuntu-builds/ubuntu_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/update_copyright.sh b/scripts/update_copyright.sh index 422f73ea7..31a658efa 100755 --- a/scripts/update_copyright.sh +++ b/scripts/update_copyright.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -10,7 +10,7 @@ #============================================================================= # Change the copyright date in all files that contain the text -# "the RAJAPerf/COPYRIGHT file", which is part of the copyright statement +# "the RAJAPerf/LICENSE file", which is part of the copyright statement # at the top of each RAJA file. We use this to distinguish RAJA files from # that we do not own (e.g., other repos included as submodules), which we do # not want to modify. Note that this file and *.git files are omitted @@ -37,7 +37,7 @@ #============================================================================= # First find all the files we want to modify #============================================================================= -find . -type f ! -name \*.git\* ! -name \*update_copyright\* -exec grep -l "the RAJAPerf/COPYRIGHT file" {} \; > files2change +find . -type f ! -name \*.git\* ! 
-name \*update_copyright\* -exec grep -l "the RAJAPerf/LICENSE file" {} \; > files2change #============================================================================= # Replace the old copyright dates with new dates @@ -46,14 +46,14 @@ for i in `cat files2change` do echo $i cp $i $i.sed.bak - sed "s/Copyright (c) 2017-20/Copyright (c) 2017-21/" $i.sed.bak > $i + sed "s/Copyright (c) 2017-21/Copyright (c) 2017-22/" $i.sed.bak > $i done for i in LICENSE RELEASE README.md do echo $i cp $i $i.sed.bak - sed "s/2017-2020/2017-2021/" $i.sed.bak > $i + sed "s/2017-2021/2017-2022/" $i.sed.bak > $i done #============================================================================= diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 877bf5306..bc1bf6b77 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# @@ -36,6 +36,9 @@ blt_add_executable( apps/DEL_DOT_VEC_2D.cpp apps/DEL_DOT_VEC_2D-Seq.cpp apps/DEL_DOT_VEC_2D-OMPTarget.cpp + apps/DIFFUSION3DPA.cpp + apps/DIFFUSION3DPA-Seq.cpp + apps/DIFFUSION3DPA-OMPTarget.cpp apps/ENERGY.cpp apps/ENERGY-Seq.cpp apps/ENERGY-OMPTarget.cpp @@ -60,6 +63,9 @@ blt_add_executable( apps/MASS3DPA.cpp apps/MASS3DPA-Seq.cpp apps/MASS3DPA-OMPTarget.cpp + apps/NODAL_ACCUMULATION_3D.cpp + apps/NODAL_ACCUMULATION_3D-Seq.cpp + apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp apps/VOL3D.cpp apps/VOL3D-Seq.cpp apps/VOL3D-OMPTarget.cpp @@ -67,9 +73,18 @@ blt_add_executable( basic/DAXPY.cpp basic/DAXPY-Seq.cpp basic/DAXPY-OMPTarget.cpp + basic/DAXPY_ATOMIC.cpp + basic/DAXPY_ATOMIC-Seq.cpp + basic/DAXPY_ATOMIC-OMPTarget.cpp basic/IF_QUAD.cpp basic/IF_QUAD-Seq.cpp basic/IF_QUAD-OMPTarget.cpp + basic/INDEXLIST.cpp + basic/INDEXLIST-Seq.cpp + basic/INDEXLIST-OMPTarget.cpp + basic/INDEXLIST_3LOOP.cpp + basic/INDEXLIST_3LOOP-Seq.cpp + basic/INDEXLIST_3LOOP-OMPTarget.cpp basic/INIT3.cpp basic/INIT3-Seq.cpp basic/INIT3-OMPTarget.cpp @@ -97,6 +112,9 @@ blt_add_executable( basic/REDUCE3_INT.cpp basic/REDUCE3_INT-Seq.cpp basic/REDUCE3_INT-OMPTarget.cpp + basic/REDUCE_STRUCT.cpp + basic/REDUCE_STRUCT-Seq.cpp + basic/REDUCE_STRUCT-OMPTarget.cpp basic/TRAP_INT.cpp basic/TRAP_INT-Seq.cpp basic/TRAP_INT-OMPTarget.cpp @@ -194,10 +212,16 @@ blt_add_executable( common/RAJAPerfSuite.cpp common/RPTypes.hpp common/RunParams.cpp + algorithm/SCAN.cpp + algorithm/SCAN-Seq.cpp + algorithm/SCAN-OMPTarget.cpp algorithm/SORT.cpp algorithm/SORT-Seq.cpp algorithm/SORTPAIRS.cpp algorithm/SORTPAIRS-Seq.cpp + algorithm/REDUCE_SUM.cpp + algorithm/REDUCE_SUM-Seq.cpp + algorithm/REDUCE_SUM-OMPTarget.cpp DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index c47ecd9f1..d423dcff9 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -10,18 +10,30 @@ #include +#ifdef RAJA_PERFSUITE_ENABLE_MPI +#include +#endif + //------------------------------------------------------------------------------ int main( int argc, char** argv ) { +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Init(&argc, &argv); + + int num_ranks; + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + rajaperf::getCout() << "\n\nRunning with " << num_ranks << " MPI ranks..." << std::endl; +#endif + // STEP 1: Create suite executor object rajaperf::Executor executor(argc, argv); // STEP 2: Assemble kernels and variants to run executor.setupSuite(); - // STEP 3: Report suite run summary + // STEP 3: Report suite run summary // (enable users to catch errors before entire suite is run) - executor.reportRunSummary(std::cout); + executor.reportRunSummary(rajaperf::getCout()); // STEP 4: Execute suite executor.runSuite(); @@ -29,7 +41,11 @@ int main( int argc, char** argv ) // STEP 5: Generate suite execution reports executor.outputRunData(); - std::cout << "\n\nDONE!!!...." << std::endl; + rajaperf::getCout() << "\n\nDONE!!!...." << std::endl; + +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Finalize(); +#endif return 0; } diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt index d328a5902..ec0fcbf74 100644 --- a/src/algorithm/CMakeLists.txt +++ b/src/algorithm/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# @@ -8,7 +8,12 @@ blt_add_library( NAME algorithm - SOURCES SORT.cpp + SOURCES SCAN.cpp + SCAN-Seq.cpp + SCAN-Hip.cpp + SCAN-Cuda.cpp + SCAN-OMP.cpp + SORT.cpp SORT-Seq.cpp SORT-Hip.cpp SORT-Cuda.cpp @@ -18,5 +23,11 @@ blt_add_library( SORTPAIRS-Hip.cpp SORTPAIRS-Cuda.cpp SORTPAIRS-OMP.cpp + REDUCE_SUM.cpp + REDUCE_SUM-Seq.cpp + REDUCE_SUM-Hip.cpp + REDUCE_SUM-Cuda.cpp + REDUCE_SUM-OMP.cpp + REDUCE_SUM-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp new file mode 100644 index 000000000..c8b4bb8e4 --- /dev/null +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -0,0 +1,303 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include "cub/device/device_reduce.cuh" +#include "cub/util_allocator.cuh" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +#define REDUCE_SUM_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, iend); + +#define REDUCE_SUM_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(x); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, + Index_type iend) +{ + extern __shared__ Real_type psum[ ]; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + + psum[ threadIdx.x ] = sum_init; + for ( ; i < iend ; i += gridDim.x * block_size ) { + psum[ threadIdx.x ] += x[i]; + } + __syncthreads(); + + for ( i = block_size / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; 
+ } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( dsum, psum[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *dsum += psum[ 0 ]; + } +#endif +} + + +void REDUCE_SUM::runCudaVariantCub(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + REDUCE_SUM_DATA_SETUP_CUDA; + + cudaStream_t stream = 0; + + int len = iend - ibegin; + + Real_type* sum_storage; + allocCudaPinnedData(sum_storage, 1); + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + len, + ::cub::Sum(), + m_sum_init, + stream)); + + // Allocate temporary storage + unsigned char* temp_storage; + allocCudaDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run + cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + len, + ::cub::Sum(), + m_sum_init, + stream)); + + cudaErrchk(cudaStreamSynchronize(stream)); + m_sum = *sum_storage; + + } + stopTimer(); + + // Free temporary storage + deallocCudaDeviceData(temp_storage); + deallocCudaPinnedData(sum_storage); + + REDUCE_SUM_DATA_TEARDOWN_CUDA; + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size > +void REDUCE_SUM::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + REDUCE_SUM_DATA_SETUP_CUDA; + + Real_ptr dsum; + 
allocCudaDeviceData(dsum, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initCudaDeviceData(dsum, &m_sum_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + reduce_sum<<>>( x, + dsum, m_sum_init, + iend ); + cudaErrchk( cudaGetLastError() ); + + Real_type lsum; + Real_ptr plsum = &lsum; + getCudaDeviceData(plsum, dsum, 1); + + m_sum = lsum; + + } + stopTimer(); + + deallocCudaDeviceData(dsum); + + REDUCE_SUM_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + REDUCE_SUM_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_CUDA; + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) +{ + if ( vid == Base_CUDA ) { + + size_t t = 0; + + if (tune_idx == t) { + + runCudaVariantCub(vid); + + } + + t += 1; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runCudaVariantBlock(vid); + + } + + t += 1; + + } + + }); + + } else if ( vid == RAJA_CUDA ) { + + size_t t = 0; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runCudaVariantBlock(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA ) { + + addVariantTuningName(vid, "cub"); + + seq_for(gpu_block_sizes_type{}, 
[&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + }); + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + }); + + } +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp new file mode 100644 index 000000000..691db7fae --- /dev/null +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -0,0 +1,334 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#if defined(__HIPCC__) +#define ROCPRIM_HIP_API 1 +#include "rocprim/device/device_reduce.hpp" +#elif defined(__CUDACC__) +#include "cub/device/device_reduce.cuh" +#include "cub/util_allocator.cuh" +#endif + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +#define REDUCE_SUM_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, iend); + +#define REDUCE_SUM_DATA_TEARDOWN_HIP \ + deallocHipDeviceData(x); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, + Index_type iend) +{ + HIP_DYNAMIC_SHARED(Real_type, psum); + + Index_type i = blockIdx.x * block_size + threadIdx.x; + + psum[ threadIdx.x ] = sum_init; + for ( ; i < iend ; i += gridDim.x * block_size ) { + psum[ threadIdx.x ] += x[i]; + } + __syncthreads(); + + for ( i = block_size / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( dsum, psum[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *dsum += psum[ 0 ]; + } +#endif +} + + +void REDUCE_SUM::runHipVariantRocprim(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_HIP ) { + + REDUCE_SUM_DATA_SETUP_HIP; + + hipStream_t stream = 0; + + int len = iend - ibegin; + + Real_type* sum_storage; + allocHipPinnedData(sum_storage, 1); + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; +#if defined(__HIPCC__) 
+ hipErrchk(::rocprim::reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + m_sum_init, + len, + rocprim::plus(), + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + len, + ::cub::Sum(), + m_sum_init, + stream)); +#endif + + // Allocate temporary storage + unsigned char* temp_storage; + allocHipDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run +#if defined(__HIPCC__) + hipErrchk(::rocprim::reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + m_sum_init, + len, + rocprim::plus(), + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + len, + ::cub::Sum(), + m_sum_init, + stream)); +#endif + + hipErrchk(hipStreamSynchronize(stream)); + m_sum = *sum_storage; + + } + stopTimer(); + + // Free temporary storage + deallocHipDeviceData(temp_storage); + deallocHipPinnedData(sum_storage); + + REDUCE_SUM_DATA_TEARDOWN_HIP; + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size > +void REDUCE_SUM::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_HIP ) { + + REDUCE_SUM_DATA_SETUP_HIP; + + Real_ptr dsum; + allocHipDeviceData(dsum, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initHipDeviceData(dsum, &m_sum_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL( (reduce_sum), dim3(grid_size), dim3(block_size), + sizeof(Real_type)*block_size, 0, + x, dsum, m_sum_init, iend ); + hipErrchk( hipGetLastError() ); + + 
Real_type lsum; + Real_ptr plsum = &lsum; + getHipDeviceData(plsum, dsum, 1); + + m_sum = lsum; + + } + stopTimer(); + + deallocHipDeviceData(dsum); + + REDUCE_SUM_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + REDUCE_SUM_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_HIP; + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) +{ + if ( vid == Base_HIP ) { + + size_t t = 0; + + if (tune_idx == t) { + + runHipVariantRocprim(vid); + + } + + t += 1; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantBlock(vid); + + } + + t += 1; + + } + + }); + + } else if ( vid == RAJA_HIP ) { + + size_t t = 0; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantBlock(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP ) { + +#if defined(__HIPCC__) + addVariantTuningName(vid, "rocprim"); +#elif defined(__CUDACC__) + addVariantTuningName(vid, "cub"); +#endif + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + }); + + } else if ( vid == 
RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/algorithm/REDUCE_SUM-OMP.cpp b/src/algorithm/REDUCE_SUM-OMP.cpp new file mode 100644 index 000000000..8f80b5633 --- /dev/null +++ b/src/algorithm/REDUCE_SUM-OMP.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + #pragma omp parallel for reduction(+:sum) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_SUM_BODY; + } + + m_sum = sum; + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto sumreduce_base_lam = [=](Index_type i) { + return x[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + #pragma omp parallel for reduction(+:sum) + for (Index_type i = ibegin; i < iend; ++i ) { + sum += 
sumreduce_base_lam(i); + } + + m_sum = sum; + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n REDUCE_SUM : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM-OMPTarget.cpp b/src/algorithm/REDUCE_SUM-OMPTarget.cpp new file mode 100644 index 000000000..b3bf8ac05 --- /dev/null +++ b/src/algorithm/REDUCE_SUM-OMPTarget.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define REDUCE_SUM_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); + +#define REDUCE_SUM_DATA_TEARDOWN_OMP_TARGET \ + deallocOpenMPDeviceData(x, did); \ + + +void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + REDUCE_SUM_DATA_SETUP_OMP_TARGET + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + #pragma omp target is_device_ptr(x) device( did ) map(tofrom:sum) + #pragma omp teams distribute parallel for reduction(+:sum) \ + thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_SUM_BODY; + } + + m_sum = sum; + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_OMP_TARGET + + } else if ( vid == RAJA_OpenMPTarget ) { + + REDUCE_SUM_DATA_SETUP_OMP_TARGET + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_OMP_TARGET + + } else { + getCout() << "\n REDUCE_SUM : Unknown OMP Target variant id = " << vid << std::endl; + } + +} + +} // end namespace 
algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/algorithm/REDUCE_SUM-Seq.cpp b/src/algorithm/REDUCE_SUM-Seq.cpp new file mode 100644 index 000000000..d4fc7cddf --- /dev/null +++ b/src/algorithm/REDUCE_SUM-Seq.cpp @@ -0,0 +1,104 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_SUM_BODY; + } + + m_sum = sum; + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto reduce_sum_base_lam = [=](Index_type i) { + return x[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + sum += reduce_sum_base_lam(i); + } + + m_sum = sum; + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << 
"\n REDUCE_SUM : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp new file mode 100644 index 000000000..f85b982f6 --- /dev/null +++ b/src/algorithm/REDUCE_SUM.cpp @@ -0,0 +1,79 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace algorithm +{ + + +REDUCE_SUM::REDUCE_SUM(const RunParams& params) + : KernelBase(rajaperf::Algorithm_REDUCE_SUM, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) + + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setFLOPsPerRep(getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Reduction); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +REDUCE_SUM::~REDUCE_SUM() +{ +} + +void REDUCE_SUM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitData(m_x, getActualProblemSize(), vid); + m_sum_init = 0.0; 
+ m_sum = 0.0; +} + +void REDUCE_SUM::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid].at(tune_idx) += calcChecksum(&m_sum, 1); +} + +void REDUCE_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_x); +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp new file mode 100644 index 000000000..f6dba52db --- /dev/null +++ b/src/algorithm/REDUCE_SUM.hpp @@ -0,0 +1,83 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// REDUCE_SUM kernel reference implementation: +/// +/// Real_type sum = std::reduce(x+ibegin, x+iend); +/// // or +/// Real_type sum = std::accumulate(x+ibegin, x+iend, 0.0); +/// // or +/// Real_type sum = 0.0; +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// sum += x[i] ; +/// } +/// + +#ifndef RAJAPerf_Algorithm_REDUCE_SUM_HPP +#define RAJAPerf_Algorithm_REDUCE_SUM_HPP + +#define REDUCE_SUM_DATA_SETUP \ + Real_ptr x = m_x; + +#define REDUCE_SUM_STD_ARGS \ + x + ibegin, x + iend + +#define REDUCE_SUM_BODY \ + sum += x[i]; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace algorithm +{ + +class REDUCE_SUM : public KernelBase +{ +public: + + REDUCE_SUM(const RunParams& params); + + ~REDUCE_SUM(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void 
runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + void runCudaVariantCub(VariantID vid); + void runHipVariantRocprim(VariantID vid); + template < size_t block_size > + void runCudaVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantBlock(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + Real_ptr m_x; + Real_type m_sum_init; + Real_type m_sum; +}; + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp new file mode 100644 index 000000000..0f9612c23 --- /dev/null +++ b/src/algorithm/SCAN-Cuda.cpp @@ -0,0 +1,116 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "cub/device/device_scan.cuh" +#include "cub/util_allocator.cuh" + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +#define SCAN_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(y, m_y, iend); + +#define SCAN_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_y, y, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); + + +void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SCAN_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + SCAN_DATA_SETUP_CUDA; + + cudaStream_t stream = 0; + + RAJA::operators::plus binary_op; + Real_type init_val = 0.0; + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + binary_op, + init_val, + len, + stream)); + + // Allocate temporary storage + unsigned char* temp_storage; + allocCudaDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + binary_op, + init_val, + len, + stream)); + + } + stopTimer(); + + // Free temporary storage + deallocCudaDeviceData(temp_storage); + + SCAN_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + SCAN_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + 
RAJA::exclusive_scan< RAJA::cuda_exec >(RAJA_SCAN_ARGS); + + } + stopTimer(); + + SCAN_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n SCAN : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp new file mode 100644 index 000000000..6ddccb115 --- /dev/null +++ b/src/algorithm/SCAN-Hip.cpp @@ -0,0 +1,143 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#if defined(__HIPCC__) +#define ROCPRIM_HIP_API 1 +#include "rocprim/device/device_scan.hpp" +#elif defined(__CUDACC__) +#include "cub/device/device_scan.cuh" +#include "cub/util_allocator.cuh" +#endif + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +#define SCAN_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(y, m_y, iend); + +#define SCAN_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_y, y, iend); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(y); + + +void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SCAN_DATA_SETUP; + + if ( vid == Base_HIP ) { + + SCAN_DATA_SETUP_HIP; + + hipStream_t stream = 0; + + RAJA::operators::plus binary_op; + Real_type init_val = 0.0; + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t 
temp_storage_bytes = 0; +#if defined(__HIPCC__) + hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + init_val, + len, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + binary_op, + init_val, + len, + stream)); +#endif + + // Allocate temporary storage + unsigned char* temp_storage; + allocHipDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run +#if defined(__HIPCC__) + hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + init_val, + len, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + binary_op, + init_val, + len, + stream)); +#endif + + } + stopTimer(); + + // Free temporary storage + deallocHipDeviceData(temp_storage); + + SCAN_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + SCAN_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::exclusive_scan< RAJA::hip_exec >(RAJA_SCAN_ARGS); + + } + stopTimer(); + + SCAN_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n SCAN : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp new file mode 100644 index 000000000..3d21e1e0b --- /dev/null +++ b/src/algorithm/SCAN-OMP.cpp @@ -0,0 +1,189 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace algorithm +{ + +void SCAN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SCAN_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) +#else + const Index_type n = iend - ibegin; + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_sums(p0); +#endif + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + #pragma omp parallel for reduction(inscan, +:scan_var) + for (Index_type i = ibegin; i < iend; ++i ) { + y[i] = scan_var; + #pragma omp scan exclusive(scan_var) + scan_var += x[i]; + } +#else + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + const int pid = omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? iend : (pid+1) * step + ibegin; + + Real_type local_scan_var = (pid == 0) ? 
scan_var : 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + y[i] = local_scan_var; + local_scan_var += x[i]; + } + thread_sums[pid] = local_scan_var; + + #pragma omp barrier + + if (pid != 0) { + + Real_type prev_sum = 0; + for (int ip = 0; ip < pid; ++ip) { + prev_sum += thread_sums[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + y[i] += prev_sum; + } + } + } +#endif + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + auto scan_lam = [=](Index_type i, Real_type scan_var) { + y[i] = scan_var; + return x[i]; + }; +#else + auto scan_lam_input = [=](Index_type i) { + return x[i]; + }; + auto scan_lam_sum_output = [=](Index_type i, Real_type sum_var) { + y[i] += sum_var; + }; + auto scan_lam_output = [=](Index_type i, Real_type scan_var) { + y[i] = scan_var; + }; + + const Index_type n = iend - ibegin; + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_sums(p0); +#endif + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + + SCAN_PROLOGUE; + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + #pragma omp parallel for reduction(inscan, +:scan_var) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp scan exclusive(scan_var) + scan_var += scan_lam(i, scan_var); + } +#else + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + const int pid = omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? iend : (pid+1) * step + ibegin; + + Real_type local_scan_var = (pid == 0) ? 
scan_var : 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + scan_lam_output(i, local_scan_var); + local_scan_var += scan_lam_input(i); + } + thread_sums[pid] = local_scan_var; + + #pragma omp barrier + + if (pid != 0) { + Real_type prev_sum = 0; + for (int ip = 0; ip < pid; ++ip) { + prev_sum += thread_sums[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + scan_lam_sum_output(i, prev_sum); + } + } + } +#endif + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::exclusive_scan(RAJA_SCAN_ARGS); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n SCAN : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/SCAN-OMPTarget.cpp b/src/algorithm/SCAN-OMPTarget.cpp new file mode 100644 index 000000000..16a1fc1fc --- /dev/null +++ b/src/algorithm/SCAN-OMPTarget.cpp @@ -0,0 +1,85 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace algorithm +{ + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define SCAN_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + \ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); + +#define SCAN_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_y, y, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); + +#endif + + +void SCAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SCAN_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMPTarget : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; + + #pragma omp target is_device_ptr(x,y) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) \ + reduction(inscan, +:scan_var) + for (Index_type i = ibegin; i < iend; ++i ) { + y[i] = scan_var; + #pragma omp scan exclusive(scan_var) + scan_var += x[i]; + } + + } + stopTimer(); + + break; + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/SCAN-Seq.cpp b/src/algorithm/SCAN-Seq.cpp new file mode 100644 index 000000000..b658ca41d --- /dev/null 
+++ b/src/algorithm/SCAN-Seq.cpp @@ -0,0 +1,90 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void SCAN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SCAN_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; + for (Index_type i = ibegin; i < iend; ++i ) { + SCAN_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; + auto scan_lam = [=, &scan_var](Index_type i) { + SCAN_BODY; + }; + for (Index_type i = ibegin; i < iend; ++i ) { + scan_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::exclusive_scan(RAJA_SCAN_ARGS); + + } + stopTimer(); + + break; + } +#endif + + default : { + std::cout << "\n SCAN : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp new file mode 100644 index 000000000..7a4d9091c --- /dev/null +++ b/src/algorithm/SCAN.cpp @@ -0,0 +1,83 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// 
and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace algorithm +{ + + +SCAN::SCAN(const RunParams& params) + : KernelBase(rajaperf::Algorithm_SCAN, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(100); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setFLOPsPerRep(1 * getActualProblemSize()); + + checksum_scale_factor = 1e-2 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ) / + getActualProblemSize(); + + setUsesFeature(Scan); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + setVariantDefined( Base_OpenMPTarget ); +#endif + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +SCAN::~SCAN() +{ +} + +void SCAN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitDataRandValue(m_x, getActualProblemSize(), vid); + allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); +} + +void SCAN::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_y, getActualProblemSize(), checksum_scale_factor); +} + +void SCAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_x); + deallocData(m_y); +} + +} // end namespace algorithm +} // end namespace rajaperf diff 
--git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp new file mode 100644 index 000000000..519789a55 --- /dev/null +++ b/src/algorithm/SCAN.hpp @@ -0,0 +1,75 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// SCAN kernel reference implementation: +/// +/// // exclusive scan +/// y[ibegin] = 0; +/// for (Index_type i = ibegin+1; i < iend; ++i) { +/// y[i] = y[i-1] + x[i-1]; +/// } +/// + +#ifndef RAJAPerf_Algorithm_SCAN_HPP +#define RAJAPerf_Algorithm_SCAN_HPP + +#define SCAN_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; + +#define SCAN_PROLOGUE \ + Real_type scan_var = 0.0; + +#define SCAN_BODY \ + y[i] = scan_var; \ + scan_var += x[i]; + +#define RAJA_SCAN_ARGS \ + RAJA::make_span(x + ibegin, iend - ibegin), \ + RAJA::make_span(y + ibegin, iend - ibegin) + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace algorithm +{ + +class SCAN : public KernelBase +{ +public: + + SCAN(const RunParams& params); + + ~SCAN(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + +private: + static const size_t default_gpu_block_size = 0; + + Real_ptr m_x; + Real_ptr m_y; +}; + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git 
a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index 8bdd212de..599a9f246 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define SORT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend*run_reps); @@ -35,7 +29,7 @@ namespace algorithm deallocCudaDeviceData(x); -void SORT::runCudaVariant(VariantID vid) +void SORT::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -50,7 +44,7 @@ void SORT::runCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::sort< RAJA::cuda_exec >(RAJA_SORT_ARGS); + RAJA::sort< RAJA::cuda_exec >(RAJA_SORT_ARGS); } stopTimer(); @@ -58,7 +52,7 @@ void SORT::runCudaVariant(VariantID vid) SORT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n SORT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index c551aeac6..6c80fbc97 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define SORT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend*run_reps); @@ -35,7 +29,7 @@ namespace algorithm deallocHipDeviceData(x); -void SORT::runHipVariant(VariantID vid) +void SORT::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -50,7 +44,7 @@ void SORT::runHipVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::sort< RAJA::hip_exec >(RAJA_SORT_ARGS); + RAJA::sort< RAJA::hip_exec >(RAJA_SORT_ARGS); } stopTimer(); @@ -58,7 +52,7 @@ void SORT::runHipVariant(VariantID vid) SORT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n SORT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORT-OMP.cpp b/src/algorithm/SORT-OMP.cpp index 534c1edd3..f62a7dbf6 100644 --- a/src/algorithm/SORT-OMP.cpp +++ b/src/algorithm/SORT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,7 +18,7 @@ namespace algorithm { -void SORT::runOpenMPVariant(VariantID vid) +void SORT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -44,12 +44,12 @@ void SORT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n SORT : Unknown variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/algorithm/SORT-Seq.cpp b/src/algorithm/SORT-Seq.cpp index e6dcc48a4..ebba50994 100644 --- a/src/algorithm/SORT-Seq.cpp +++ b/src/algorithm/SORT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -18,7 +18,7 @@ namespace algorithm { -void SORT::runSeqVariant(VariantID vid) +void SORT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -57,7 +57,7 @@ void SORT::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n SORT : Unknown variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index d9d659482..b9722c4d7 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -47,17 +47,17 @@ SORT::~SORT() { } -void SORT::setUp(VariantID vid) +void SORT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandValue(m_x, getActualProblemSize()*getRunReps(), vid); } -void SORT::updateChecksum(VariantID vid) +void SORT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); } -void SORT::tearDown(VariantID vid) +void SORT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index f576bee97..0670c9dd0 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -42,20 +42,22 @@ class SORT : public KernelBase ~SORT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid) + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - std::cout << "\n SORT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown OMP Target variant id = " << vid << std::endl; } private: + static const size_t default_gpu_block_size = 0; + Real_ptr m_x; }; diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index 7402e880a..0c09bfe1f 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define SORTPAIRS_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend*run_reps); \ allocAndInitCudaDeviceData(i, m_i, iend*run_reps); @@ -38,7 +32,7 @@ namespace algorithm deallocCudaDeviceData(i); -void SORTPAIRS::runCudaVariant(VariantID vid) +void SORTPAIRS::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -53,7 +47,7 @@ void SORTPAIRS::runCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::sort_pairs< RAJA::cuda_exec >(RAJA_SORTPAIRS_ARGS); + RAJA::sort_pairs< RAJA::cuda_exec >(RAJA_SORTPAIRS_ARGS); } stopTimer(); @@ -61,7 +55,7 @@ void SORTPAIRS::runCudaVariant(VariantID vid) SORTPAIRS_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n SORTPAIRS : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index c1a2a54c4..fbdbc660d 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define SORTPAIRS_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend*run_reps); \ allocAndInitHipDeviceData(i, m_i, iend*run_reps); @@ -38,7 +32,7 @@ namespace algorithm deallocHipDeviceData(i); -void SORTPAIRS::runHipVariant(VariantID vid) +void SORTPAIRS::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -53,7 +47,7 @@ void SORTPAIRS::runHipVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::sort_pairs< RAJA::hip_exec >(RAJA_SORTPAIRS_ARGS); + RAJA::sort_pairs< RAJA::hip_exec >(RAJA_SORTPAIRS_ARGS); } stopTimer(); @@ -61,7 +55,7 @@ void SORTPAIRS::runHipVariant(VariantID vid) SORTPAIRS_DATA_TEARDOWN_HIP; } else { - std::cout << "\n SORTPAIRS : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORTPAIRS-OMP.cpp b/src/algorithm/SORTPAIRS-OMP.cpp index 8cb5a90dc..5fabe18da 100644 --- a/src/algorithm/SORTPAIRS-OMP.cpp +++ b/src/algorithm/SORTPAIRS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,7 +18,7 @@ namespace algorithm { -void SORTPAIRS::runOpenMPVariant(VariantID vid) +void SORTPAIRS::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -44,12 +44,12 @@ void SORTPAIRS::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n SORTPAIRS : Unknown variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/algorithm/SORTPAIRS-Seq.cpp b/src/algorithm/SORTPAIRS-Seq.cpp index 6131c9649..64fee5d1e 100644 --- a/src/algorithm/SORTPAIRS-Seq.cpp +++ b/src/algorithm/SORTPAIRS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,7 +21,7 @@ namespace algorithm { -void SORTPAIRS::runSeqVariant(VariantID vid) +void SORTPAIRS::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -77,7 +77,7 @@ void SORTPAIRS::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n SORTPAIRS : Unknown variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 7f2e59cbb..df175844e 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -47,19 +47,19 @@ SORTPAIRS::~SORTPAIRS() { } -void SORTPAIRS::setUp(VariantID vid) +void SORTPAIRS::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandValue(m_x, getActualProblemSize()*getRunReps(), vid); allocAndInitDataRandValue(m_i, getActualProblemSize()*getRunReps(), vid); } -void SORTPAIRS::updateChecksum(VariantID vid) +void SORTPAIRS::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); - checksum[vid] += calcChecksum(m_i, getActualProblemSize()*getRunReps()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); + checksum[vid][tune_idx] += calcChecksum(m_i, getActualProblemSize()*getRunReps()); } -void SORTPAIRS::tearDown(VariantID vid) +void SORTPAIRS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index b6f03005f..658d3ad4b 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -41,20 +41,22 @@ class SORTPAIRS : public KernelBase ~SORTPAIRS(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid) + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - std::cout << "\n SORTPAIRS : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown OMP Target variant id = " << vid << std::endl; } private: + static const size_t default_gpu_block_size = 0; + Real_ptr m_x; Real_ptr m_i; }; diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index 52231d8b9..390412aa3 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -1,11 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#include "common/RAJAPerfSuite.hpp" #include "AppsData.hpp" #include @@ -23,7 +24,7 @@ void setMeshPositions_2d(Real_ptr x, Real_type dx, const ADomain& domain) { if (domain.ndims != 2) { - std::cout << "\n******* ERROR!!! domain is not 2d *******" << std::endl; + getCout() << "\n******* ERROR!!! 
domain is not 2d *******" << std::endl; return; } @@ -34,8 +35,8 @@ void setMeshPositions_2d(Real_ptr x, Real_type dx, Index_type jp = domain.jp; - Index_type npnl = domain.NPNL; - Index_type npnr = domain.NPNR; + Index_type npnl = domain.NPNL; + Index_type npnr = domain.NPNR; Real_ptr x1, x2, x3, x4; Real_ptr y1, y2, y3, y4; @@ -66,7 +67,7 @@ void setMeshPositions_3d(Real_ptr x, Real_type dx, const ADomain& domain) { if (domain.ndims != 3) { - std::cout << "\n******* ERROR!!! domain is not 3d *******" << std::endl; + getCout() << "\n******* ERROR!!! domain is not 3d *******" << std::endl; return; } @@ -80,8 +81,8 @@ void setMeshPositions_3d(Real_ptr x, Real_type dx, Index_type jp = domain.jp; Index_type kp = domain.kp; - Index_type npnl = domain.NPNL; - Index_type npnr = domain.NPNR; + Index_type npnl = domain.NPNL; + Index_type npnr = domain.NPNR; Real_ptr x0, x1, x2, x3, x4, x5, x6, x7; Real_ptr y0, y1, y2, y3, y4, y5, y6, y7; diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index f1616968c..f9b9251e9 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 16b822cbb..cf1ed84ea 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# @@ -15,6 +15,12 @@ blt_add_library( DEL_DOT_VEC_2D-Cuda.cpp DEL_DOT_VEC_2D-OMP.cpp DEL_DOT_VEC_2D-OMPTarget.cpp + DIFFUSION3DPA.cpp + DIFFUSION3DPA-Cuda.cpp + DIFFUSION3DPA-Hip.cpp + DIFFUSION3DPA-Seq.cpp + DIFFUSION3DPA-OMP.cpp + DIFFUSION3DPA-OMPTarget.cpp ENERGY.cpp ENERGY-Seq.cpp ENERGY-Hip.cpp @@ -57,6 +63,12 @@ blt_add_library( MASS3DPA-Seq.cpp MASS3DPA-OMP.cpp MASS3DPA-OMPTarget.cpp + NODAL_ACCUMULATION_3D.cpp + NODAL_ACCUMULATION_3D-Seq.cpp + NODAL_ACCUMULATION_3D-Hip.cpp + NODAL_ACCUMULATION_3D-Cuda.cpp + NODAL_ACCUMULATION_3D-OMP.cpp + NODAL_ACCUMULATION_3D-OMPTarget.cpp PRESSURE.cpp PRESSURE-Seq.cpp PRESSURE-Hip.cpp diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 82973db44..c19d0770e 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -25,12 +25,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define DEL_DOT_VEC_2D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_array_length); \ allocAndInitCudaDeviceData(y, m_y, m_array_length); \ @@ -48,6 +42,8 @@ namespace apps deallocCudaDeviceData(div); \ deallocCudaDeviceData(real_zones); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void deldotvec2d(Real_ptr div, const Real_ptr x1, const Real_ptr x2, const Real_ptr x3, const Real_ptr x4, @@ -61,7 +57,7 @@ __global__ void deldotvec2d(Real_ptr div, const Real_type half, const Real_type ptiny, Index_type iend) { - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ii = blockIdx.x * block_size + threadIdx.x; if (ii < iend) { DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; @@ -69,7 +65,8 @@ __global__ void deldotvec2d(Real_ptr div, } -void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) +template < size_t block_size > +void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type iend = m_domain->n_real_zones; @@ -90,7 +87,7 @@ void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - deldotvec2d<<>>(div, + deldotvec2d<<>>(div, x1, x2, x3, x4, y1, y2, y3, y4, fx1, fx2, fx3, fx4, @@ -119,7 +116,7 @@ void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( 0, iend, [=] __device__ (Index_type ii) { @@ -142,7 +139,7 @@ void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - camp::resources::Resource working_res{camp::resources::Cuda()}; + camp::resources::Resource working_res{camp::resources::Cuda::get_default()}; RAJA::TypedListSegment 
zones(m_domain->real_zones, m_domain->n_real_zones, working_res); @@ -161,10 +158,12 @@ void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) DEL_DOT_VEC_2D_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n DEL_DOT_VEC_2D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DEL_DOT_VEC_2D, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index ba97858c8..782e4099c 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -25,12 +25,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define DEL_DOT_VEC_2D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_array_length); \ allocAndInitHipDeviceData(y, m_y, m_array_length); \ @@ -48,6 +42,8 @@ namespace apps deallocHipDeviceData(div); \ deallocHipDeviceData(real_zones); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void deldotvec2d(Real_ptr div, const Real_ptr x1, const Real_ptr x2, const Real_ptr x3, const Real_ptr x4, @@ -61,7 +57,7 @@ __global__ void deldotvec2d(Real_ptr div, const Real_type half, const Real_type ptiny, Index_type iend) { - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ii = blockIdx.x * block_size + threadIdx.x; if (ii < iend) { DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; @@ -69,7 +65,8 @@ __global__ void deldotvec2d(Real_ptr div, } -void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) +template < size_t block_size > +void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type iend = m_domain->n_real_zones; @@ -90,7 +87,7 @@ void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((deldotvec2d), dim3(grid_size), dim3(block_size), 0, 0, div, + hipLaunchKernelGGL((deldotvec2d), dim3(grid_size), dim3(block_size), 0, 0, div, x1, x2, x3, x4, y1, y2, y3, y4, fx1, fx2, fx3, fx4, @@ -125,7 +122,7 @@ void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, 0, iend, deldotvec2d_lambda); hipErrchk( hipGetLastError() ); @@ -144,7 +141,7 @@ void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - 
camp::resources::Resource working_res{camp::resources::Hip()}; + camp::resources::Resource working_res{camp::resources::Hip::get_default()}; RAJA::TypedListSegment zones(m_domain->real_zones, m_domain->n_real_zones, working_res); @@ -163,10 +160,12 @@ void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) DEL_DOT_VEC_2D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n DEL_DOT_VEC_2D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DEL_DOT_VEC_2D, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp index 75e640459..91250c796 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,13 +16,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid) +void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -44,7 +44,7 @@ void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp parallel for + #pragma omp parallel for for (Index_type ii = ibegin ; ii < iend ; ++ii ) { DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; @@ -79,7 +79,7 @@ void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid) case RAJA_OpenMP : { - camp::resources::Resource working_res{camp::resources::Host()}; + camp::resources::Resource working_res{camp::resources::Host::get_default()}; RAJA::TypedListSegment zones(m_domain->real_zones, m_domain->n_real_zones, working_res); @@ -100,12 +100,12 @@ void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp index b9196680d..479ab795e 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -20,7 +20,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -51,7 +51,7 @@ namespace apps deallocOpenMPDeviceData(real_zones, did); -void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) +void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,7 +62,7 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) if ( vid == Base_OpenMPTarget ) { DEL_DOT_VEC_2D_DATA_SETUP_OMP_TARGET; - + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; @@ -74,7 +74,7 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(x1,x2,x3,x4, y1,y2,y3,y4, \ fx1,fx2,fx3,fx4, fy1,fy2,fy3,fy4, \ div, real_zones) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type ii = ibegin ; ii < iend ; ++ii ) { DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; @@ -88,13 +88,13 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) } else if ( vid == RAJA_OpenMPTarget ) { DEL_DOT_VEC_2D_DATA_SETUP_OMP_TARGET; - + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - camp::resources::Resource working_res{camp::resources::Omp()}; + camp::resources::Resource working_res{camp::resources::Omp::get_default()}; RAJA::TypedListSegment zones(m_domain->real_zones, m_domain->n_real_zones, working_res); @@ -114,7 +114,7 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) DEL_DOT_VEC_2D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n DEL_DOT_VEC_2D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown OMP 
Target variant id = " << vid << std::endl; } } diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp index bd6b0884b..07100da04 100644 --- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -16,13 +16,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid) +void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -51,7 +51,7 @@ void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -76,9 +76,9 @@ void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid) case RAJA_Seq : { - camp::resources::Resource working_res{camp::resources::Host()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, + camp::resources::Resource working_res{camp::resources::Host::get_default()}; + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, working_res); auto deldotvec2d_lam = [=](Index_type i) { @@ -91,14 +91,14 @@ void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid) RAJA::forall(zones, deldotvec2d_lam); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index cec0af410..24121e157 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -69,7 +69,7 @@ DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() delete m_domain; } -void DEL_DOT_VEC_2D::setUp(VariantID vid) +void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitDataConst(m_y, m_array_length, 0.0, vid); @@ -87,12 +87,12 @@ void DEL_DOT_VEC_2D::setUp(VariantID vid) m_half = 0.5; } -void DEL_DOT_VEC_2D::updateChecksum(VariantID vid) +void DEL_DOT_VEC_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_div, m_array_length); + checksum[vid][tune_idx] += calcChecksum(m_div, m_array_length); } -void DEL_DOT_VEC_2D::tearDown(VariantID vid) +void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 1a4d7670b..60d577a05 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -104,17 +104,27 @@ class DEL_DOT_VEC_2D : public KernelBase ~DEL_DOT_VEC_2D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_xdot; diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp new file mode 100644 index 000000000..9ceafb94c --- /dev/null +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -0,0 +1,374 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +#define DIFFUSION3DPA_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ + allocAndInitCudaDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ + allocAndInitCudaDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ + allocAndInitCudaDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + allocAndInitCudaDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); + +#define DIFFUSION3DPA_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + deallocCudaDeviceData(Basis); \ + deallocCudaDeviceData(dBasis); \ + deallocCudaDeviceData(D); \ + deallocCudaDeviceData(X); \ + deallocCudaDeviceData(Y); + +template < size_t block_size > + __launch_bounds__(block_size) +__global__ void Diffusion3DPA(const Real_ptr Basis, + const Real_ptr dBasis, const Real_ptr D, + const Real_ptr X, Real_ptr Y, bool symmetric) { + + const int e = blockIdx.x; + + DIFFUSION3DPA_0_GPU; + + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; + } + } + } + + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(qz, z, 
DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; + } + } + } + __syncthreads(); + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD(d, y, DPA_D1D) { + GPU_FOREACH_THREAD(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; + } + } + } +} + +template < size_t block_size > +void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_CUDA: { + + DIFFUSION3DPA_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); + + Diffusion3DPA<<>>( + Basis, dBasis, D, X, Y, symmetric); + + cudaErrchk(cudaGetLastError()); + } + stopTimer(); + + DIFFUSION3DPA_DATA_TEARDOWN_CUDA; + + break; + } + + case RAJA_CUDA: { + + DIFFUSION3DPA_DATA_SETUP_CUDA; + + constexpr bool async = true; + + using launch_policy = + RAJA::expt::LaunchPolicy>; + + using outer_x = + RAJA::expt::LoopPolicy; + + using inner_x = + RAJA::expt::LoopPolicy; + + using inner_y = + RAJA::expt::LoopPolicy; + + using inner_z = + RAJA::expt::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::expt::launch( + RAJA::expt::Grid(RAJA::expt::Teams(NE), + RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_GPU; + + 
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_1; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, 
RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::expt::loop + } // lambda (d) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch + + } // loop over kernel reps + stopTimer(); + + DIFFUSION3DPA_DATA_TEARDOWN_CUDA; + + break; + } + + default: { + + getCout() << "\n DIFFUSION3DPA : Unknown Cuda variant id = " << vid + << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DIFFUSION3DPA, Cuda) + +} // end namespace apps +} // end namespace rajaperf + +#endif // 
RAJA_ENABLE_CUDA diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp new file mode 100644 index 000000000..58a40a77e --- /dev/null +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -0,0 +1,376 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +#define DIFFUSION3DPA_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ + allocAndInitHipDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ + allocAndInitHipDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ + allocAndInitHipDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + allocAndInitHipDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); + +#define DIFFUSION3DPA_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + deallocHipDeviceData(Basis); \ + deallocHipDeviceData(dBasis); \ + deallocHipDeviceData(D); \ + deallocHipDeviceData(X); \ + deallocHipDeviceData(Y); + +template < size_t block_size > + __launch_bounds__(block_size) +__global__ void Diffusion3DPA(const Real_ptr Basis, + const Real_ptr dBasis, const Real_ptr D, + const Real_ptr X, Real_ptr Y, bool symmetric) { + + const int e = hipBlockIdx_x; + + DIFFUSION3DPA_0_GPU; + + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; + } + } + } + + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + 
GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; + } + } + } + __syncthreads(); + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD(d, y, DPA_D1D) { + GPU_FOREACH_THREAD(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; + } + } + } +} + +template < size_t block_size > +void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_HIP: { + + DIFFUSION3DPA_DATA_SETUP_HIP; + + dim3 nblocks(NE); + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipLaunchKernelGGL((Diffusion3DPA), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, + Basis, dBasis, D, X, Y, symmetric); + + hipErrchk(hipGetLastError()); + } + stopTimer(); + + DIFFUSION3DPA_DATA_TEARDOWN_HIP; + + break; + } + + case RAJA_HIP: { + + DIFFUSION3DPA_DATA_SETUP_HIP; + + constexpr bool async = true; + + using launch_policy = + RAJA::expt::LaunchPolicy>; + + using 
outer_x = + RAJA::expt::LoopPolicy; + + using inner_x = + RAJA::expt::LoopPolicy; + + using inner_y = + RAJA::expt::LoopPolicy; + + using inner_z = + RAJA::expt::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::expt::launch( + RAJA::expt::Grid(RAJA::expt::Teams(NE), + RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_GPU; + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_1; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda 
(qy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::expt::loop + } // lambda (d) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // 
lambda (dz) + ); //RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch + + } // loop over kernel reps + stopTimer(); + + DIFFUSION3DPA_DATA_TEARDOWN_HIP; + + break; + } + + default: { + + getCout() << "\n DIFFUSION3DPA : Unknown Hip variant id = " << vid + << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DIFFUSION3DPA, Hip) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp new file mode 100644 index 000000000..7d32b2b41 --- /dev/null +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -0,0 +1,323 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace apps { + +void DIFFUSION3DPA::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +#pragma omp parallel for + for (int e = 0; e < NE; ++e) { + + DIFFUSION3DPA_0_CPU; + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; + } + } + } + + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, 
y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; + } + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; + } + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; + } + } + } + + CPU_FOREACH(d, y, DPA_D1D) { + CPU_FOREACH(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; + } + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; + } + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; + } + } + } + + } // element loop + } + stopTimer(); + + break; + } + + case RAJA_OpenMP: { + + // Currently Teams requires two policies if compiled with a device + using launch_policy = RAJA::expt::LaunchPolicy; + + using outer_x = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; + + using inner_z = RAJA::expt::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Grid is empty as the host does not need a compute grid to be specified + RAJA::expt::launch( + RAJA::expt::Grid(), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_CPU; + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_1; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int 
RAJA_UNUSED_ARG(dz)) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::expt::loop + } // lambda (d) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + 
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch + } // loop over kernel reps + stopTimer(); + + return; + } + + default: + getCout() << "\n DIFFUSION3DPA : Unknown OpenMP variant id = " << vid + << std::endl; + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp new file mode 100644 index 000000000..16cff1087 --- /dev/null +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -0,0 +1,39 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + const Index_type run_reps = getRunReps(); + + switch (vid) { + + default: { + + getCout() << "\n DIFFUSION3DPA : Unknown OpenMPTarget variant id = " << vid + << std::endl; + break; + } + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp new file mode 100644 index 000000000..a84b4bc61 --- /dev/null +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -0,0 +1,318 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace apps { + +void DIFFUSION3DPA::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + const Index_type run_reps = getRunReps(); + + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_Seq: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (int e = 0; e < NE; ++e) { + + DIFFUSION3DPA_0_CPU; + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; + } + } + } + + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; + } + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; + } + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; + } + } + } + + CPU_FOREACH(d, y, DPA_D1D) { + CPU_FOREACH(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; + } + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; + } + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; + } + } + } + + } // element loop + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case RAJA_Seq: { + + // Currently Teams requires two policies if compiled with a device + using launch_policy = RAJA::expt::LaunchPolicy; + + using 
outer_x = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; + + using inner_z = RAJA::expt::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Grid is empty as the host does not need a compute grid to be specified + RAJA::expt::launch( + RAJA::expt::Grid(), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_CPU; + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_1; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda 
(qy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::expt::loop + } // lambda (d) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // 
lambda (dz) + ); //RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch + } // loop over kernel reps + stopTimer(); + + return; + } +#endif // RUN_RAJA_SEQ + + default: + getCout() << "\n DIFFUSION3DPA : Unknown Seq variant id = " << vid + << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp new file mode 100644 index 000000000..3844668c6 --- /dev/null +++ b/src/apps/DIFFUSION3DPA.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) + : KernelBase(rajaperf::Apps_DIFFUSION3DPA, params) +{ + m_NE_default = 15625; + + setDefaultProblemSize(m_NE_default*DPA_Q1D*DPA_Q1D*DPA_Q1D); + setDefaultReps(50); + + m_NE = std::max(getTargetProblemSize()/(DPA_Q1D*DPA_Q1D*DPA_Q1D), Index_type(1)); + + setActualProblemSize( m_NE*DPA_Q1D*DPA_Q1D*DPA_Q1D ); + + setItsPerRep(getActualProblemSize()); + setKernelsPerRep(1); + + setBytesPerRep( 2*DPA_Q1D*DPA_D1D*sizeof(Real_type) + + DPA_Q1D*DPA_Q1D*DPA_Q1D*SYM*m_NE*sizeof(Real_type) + + DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) + + DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) ); + + setFLOPsPerRep(m_NE * (DPA_Q1D * DPA_D1D + + 5 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_D1D + + 7 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_Q1D + + 7 * DPA_Q1D * DPA_D1D * DPA_Q1D * DPA_Q1D + + 15 * DPA_Q1D * DPA_Q1D * DPA_Q1D + + DPA_Q1D * DPA_D1D + + 7 * 
DPA_Q1D * DPA_Q1D * DPA_D1D * DPA_Q1D + + 7 * DPA_Q1D * DPA_Q1D * DPA_D1D * DPA_D1D + + 7 * DPA_D1D * DPA_Q1D * DPA_D1D * DPA_D1D + + 3 * DPA_D1D * DPA_D1D * DPA_D1D)); + + setUsesFeature(Teams); + + setVariantDefined( Base_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + +} + +DIFFUSION3DPA::~DIFFUSION3DPA() +{ +} + +void DIFFUSION3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + + allocAndInitDataConst(m_B, int(DPA_Q1D*DPA_D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_G, int(DPA_Q1D*DPA_D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, int(DPA_Q1D*DPA_Q1D*DPA_Q1D*SYM*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_X, int(DPA_D1D*DPA_D1D*DPA_D1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_Y, int(DPA_D1D*DPA_D1D*DPA_D1D*m_NE), Real_type(0.0), vid); +} + +void DIFFUSION3DPA::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE); +} + +void DIFFUSION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + + deallocData(m_B); + deallocData(m_G); + deallocData(m_D); + deallocData(m_X); + deallocData(m_Y); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp new file mode 100644 index 000000000..b0ba7c977 --- /dev/null +++ b/src/apps/DIFFUSION3DPA.hpp @@ -0,0 +1,511 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Action of 3D diffusion matrix via partial assembly +/// +/// Based on MFEM's/CEED algorithms. +/// Reference implementation +/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_diffusion_pa.cpp +/// +/// for (int e = 0; e < NE; ++e) { +/// +/// constexpr int MQ1 = DPA_Q1D; +/// constexpr int MD1 = DPA_D1D; +/// constexpr int MDQ = (MQ1 > ? MQ1 : MD1; +/// double sBG[MQ1*MD1]; +/// double (*B)[MD1] = (double (*)[MD1]) sBG; +/// double (*G)[MD1] = (double (*)[MD1]) sBG; +/// double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; +/// double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; +/// double sm0[3][MDQ*MDQ*MDQ]; +/// double sm1[3][MDQ*MDQ*MDQ]; +/// double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); +/// double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); +/// double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); +/// double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); +/// double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); +/// double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); +/// double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); +/// double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); +/// double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); +/// double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); +/// double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); +/// double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); +/// double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); +/// double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); +/// double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); +/// +/// for(int dz=0;dz MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED double sBG[MQ1*MD1]; \ + double (*B)[MD1] = (double (*)[MD1]) sBG; \ + double (*G)[MD1] = (double (*)[MD1]) sBG; \ + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ + RAJA_TEAM_SHARED double sm0[3][MDQ*MDQ*MDQ]; \ + RAJA_TEAM_SHARED double sm1[3][MDQ*MDQ*MDQ]; \ + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ + double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); + +#define DIFFUSION3DPA_0_CPU \ + constexpr int MQ1 = DPA_Q1D; \ + constexpr int MD1 = DPA_D1D; \ + constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + double sBG[MQ1*MD1]; \ + double (*B)[MD1] = (double (*)[MD1]) sBG; \ + double (*G)[MD1] = (double (*)[MD1]) sBG; \ + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ + double sm0[3][MDQ*MDQ*MDQ]; \ + double sm1[3][MDQ*MDQ*MDQ]; \ + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ + double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); + +#define DIFFUSION3DPA_1 \ + s_X[dz][dy][dx] = dpaX_(dx,dy,dz,e); + +#define DIFFUSION3DPA_2 \ + const int i = qi(qx,dy,DPA_Q1D); \ + const int j = dj(qx,dy,DPA_D1D); \ + const int k = qk(qx,dy,DPA_Q1D); \ + const int l = dl(qx,dy,DPA_D1D); \ + B[i][j] = b(qx,dy); \ + G[k][l] = g(qx,dy) * sign(qx,dy); \ + +#define DIFFUSION3DPA_3 \ + double u = 0.0, v = 0.0; \ + RAJAPERF_UNROLL(MD1) \ + for (int dx = 0; dx < DPA_D1D; ++dx) \ + { \ + const int i = qi(qx,dx,DPA_Q1D); \ + const int j = dj(qx,dx,DPA_D1D); \ + const int k = qk(qx,dx,DPA_Q1D); \ + const int l = dl(qx,dx,DPA_D1D); \ + const double s = sign(qx,dx); \ + const double coords = s_X[dz][dy][dx]; \ + u += coords * B[i][j]; \ + v += coords * G[k][l] * s; \ + } \ + DDQ0[dz][dy][qx] = u; \ + 
DDQ1[dz][dy][qx] = v; + +#define DIFFUSION3DPA_4 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MD1) \ + for (int dy = 0; dy < DPA_D1D; ++dy) \ + { \ + const int i = qi(qy,dy,DPA_Q1D); \ + const int j = dj(qy,dy,DPA_D1D); \ + const int k = qk(qy,dy,DPA_Q1D); \ + const int l = dl(qy,dy,DPA_D1D); \ + const double s = sign(qy,dy); \ + u += DDQ1[dz][dy][qx] * B[i][j]; \ + v += DDQ0[dz][dy][qx] * G[k][l] * s; \ + w += DDQ0[dz][dy][qx] * B[i][j]; \ + } \ + DQQ0[dz][qy][qx] = u; \ + DQQ1[dz][qy][qx] = v; \ + DQQ2[dz][qy][qx] = w; + +#define DIFFUSION3DPA_5 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MD1) \ + for (int dz = 0; dz < DPA_D1D; ++dz) \ + { \ + const int i = qi(qz,dz,DPA_Q1D); \ + const int j = dj(qz,dz,DPA_D1D); \ + const int k = qk(qz,dz,DPA_Q1D); \ + const int l = dl(qz,dz,DPA_D1D); \ + const double s = sign(qz,dz); \ + u += DQQ0[dz][qy][qx] * B[i][j]; \ + v += DQQ1[dz][qy][qx] * B[i][j]; \ + w += DQQ2[dz][qy][qx] * G[k][l] * s; \ + } \ + const double O11 = d(qx,qy,qz,0,e); \ + const double O12 = d(qx,qy,qz,1,e); \ + const double O13 = d(qx,qy,qz,2,e); \ + const double O21 = symmetric ? O12 : d(qx,qy,qz,3,e); \ + const double O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e); \ + const double O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e); \ + const double O31 = symmetric ? O13 : d(qx,qy,qz,6,e); \ + const double O32 = symmetric ? O23 : d(qx,qy,qz,7,e); \ + const double O33 = symmetric ? 
d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e); \ + const double gX = u; \ + const double gY = v; \ + const double gZ = w; \ + QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ); \ + QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ); \ + QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ); + +#define DIFFUSION3DPA_6 \ + const int i = qi(q,d,DPA_Q1D); \ + const int j = dj(q,d,DPA_D1D); \ + const int k = qk(q,d,DPA_Q1D); \ + const int l = dl(q,d,DPA_D1D); \ + Bt[j][i] = b(q,d); \ + Gt[l][k] = g(q,d) * sign(q,d); + +#define DIFFUSION3DPA_7 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MQ1) \ + for (int qx = 0; qx < DPA_Q1D; ++qx) \ + { \ + const int i = qi(qx,dx,DPA_Q1D); \ + const int j = dj(qx,dx,DPA_D1D); \ + const int k = qk(qx,dx,DPA_Q1D); \ + const int l = dl(qx,dx,DPA_D1D); \ + const double s = sign(qx,dx); \ + u += QQQ0[qz][qy][qx] * Gt[l][k] * s; \ + v += QQQ1[qz][qy][qx] * Bt[j][i]; \ + w += QQQ2[qz][qy][qx] * Bt[j][i]; \ + } \ + QQD0[qz][qy][dx] = u; \ + QQD1[qz][qy][dx] = v; \ + QQD2[qz][qy][dx] = w; + +#define DIFFUSION3DPA_8 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(DPA_Q1D) \ + for (int qy = 0; qy < DPA_Q1D; ++qy) \ + { \ + const int i = qi(qy,dy,DPA_Q1D); \ + const int j = dj(qy,dy,DPA_D1D); \ + const int k = qk(qy,dy,DPA_Q1D); \ + const int l = dl(qy,dy,DPA_D1D); \ + const double s = sign(qy,dy); \ + u += QQD0[qz][qy][dx] * Bt[j][i]; \ + v += QQD1[qz][qy][dx] * Gt[l][k] * s; \ + w += QQD2[qz][qy][dx] * Bt[j][i]; \ + } \ + QDD0[qz][dy][dx] = u; \ + QDD1[qz][dy][dx] = v; \ + QDD2[qz][dy][dx] = w; + +#define DIFFUSION3DPA_9 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MQ1) \ + for (int qz = 0; qz < DPA_Q1D; ++qz) \ + { \ + const int i = qi(qz,dz,DPA_Q1D); \ + const int j = dj(qz,dz,DPA_D1D); \ + const int k = qk(qz,dz,DPA_Q1D); \ + const int l = dl(qz,dz,DPA_D1D); \ + const double s = sign(qz,dz); \ + u += QDD0[qz][dy][dx] * Bt[j][i]; \ + v += QDD1[qz][dy][dx] * Bt[j][i]; \ + w += QDD2[qz][dy][dx] * Gt[l][k] * s; \ + } \ + 
dpaY_(dx,dy,dz,e) += (u + v + w); + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ + +class DIFFUSION3DPA : public KernelBase +{ +public: + + DIFFUSION3DPA(const RunParams& params); + + ~DIFFUSION3DPA(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; + using gpu_block_sizes_type = gpu_block_size::list_type; + + Real_ptr m_B; + Real_ptr m_Bt; + Real_ptr m_G; + Real_ptr m_Gt; + Real_ptr m_D; + Real_ptr m_X; + Real_ptr m_Y; + + Index_type m_NE; + Index_type m_NE_default; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index e8a97b99c..a99a928e3 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define ENERGY_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(e_new, m_e_new, iend); \ allocAndInitCudaDeviceData(e_old, m_e_old, iend); \ @@ -63,16 +57,20 @@ namespace apps deallocCudaDeviceData(qq_old); \ deallocCudaDeviceData(vnewc); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc1(Real_ptr e_new, Real_ptr e_old, Real_ptr delvc, Real_ptr p_old, Real_ptr q_old, Real_ptr work, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, Real_ptr compHalfStep, Real_ptr pHalfStep, Real_ptr e_new, Real_ptr bvc, Real_ptr pbvc, @@ -80,33 +78,39 @@ __global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, Real_type rho0, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY2; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc3(Real_ptr e_new, Real_ptr delvc, Real_ptr p_old, Real_ptr q_old, Real_ptr pHalfStep, Real_ptr q_new, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY3; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc4(Real_ptr e_new, Real_ptr work, Real_type e_cut, Real_type emin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY4; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc5(Real_ptr delvc, 
Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, Real_ptr bvc, Real_ptr p_new, @@ -116,12 +120,14 @@ __global__ void energycalc5(Real_ptr delvc, Real_type rho0, Real_type e_cut, Real_type emin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY5; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc6(Real_ptr delvc, Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, Real_ptr bvc, Real_ptr p_new, @@ -130,14 +136,15 @@ __global__ void energycalc6(Real_ptr delvc, Real_type rho0, Real_type q_cut, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY6; } } -void ENERGY::runCudaVariant(VariantID vid) +template < size_t block_size > +void ENERGY::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -154,12 +161,12 @@ void ENERGY::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - energycalc1<<>>( e_new, e_old, delvc, + energycalc1<<>>( e_new, e_old, delvc, p_old, q_old, work, iend ); cudaErrchk( cudaGetLastError() ); - energycalc2<<>>( delvc, q_new, + energycalc2<<>>( delvc, q_new, compHalfStep, pHalfStep, e_new, bvc, pbvc, ql_old, qq_old, @@ -167,18 +174,18 @@ void ENERGY::runCudaVariant(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - energycalc3<<>>( e_new, delvc, + energycalc3<<>>( e_new, delvc, p_old, q_old, pHalfStep, q_new, iend ); cudaErrchk( cudaGetLastError() ); - energycalc4<<>>( e_new, work, + energycalc4<<>>( e_new, work, e_cut, emin, iend ); cudaErrchk( cudaGetLastError() ); - energycalc5<<>>( delvc, + energycalc5<<>>( delvc, pbvc, e_new, vnewc, bvc, p_new, ql_old, qq_old, @@ -188,7 +195,7 @@ void ENERGY::runCudaVariant(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - energycalc6<<>>( delvc, 
+ energycalc6<<>>( delvc, pbvc, e_new, vnewc, bvc, p_new, q_new, @@ -257,10 +264,12 @@ void ENERGY::runCudaVariant(VariantID vid) ENERGY_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n ENERGY : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ENERGY, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 96a1c759a..e7e882cff 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define ENERGY_DATA_SETUP_HIP \ allocAndInitHipDeviceData(e_new, m_e_new, iend); \ allocAndInitHipDeviceData(e_old, m_e_old, iend); \ @@ -63,16 +57,20 @@ namespace apps deallocHipDeviceData(qq_old); \ deallocHipDeviceData(vnewc); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc1(Real_ptr e_new, Real_ptr e_old, Real_ptr delvc, Real_ptr p_old, Real_ptr q_old, Real_ptr work, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, Real_ptr compHalfStep, Real_ptr pHalfStep, Real_ptr e_new, Real_ptr bvc, Real_ptr pbvc, @@ -80,33 +78,39 @@ __global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, Real_type rho0, Index_type iend) { - 
Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY2; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc3(Real_ptr e_new, Real_ptr delvc, Real_ptr p_old, Real_ptr q_old, Real_ptr pHalfStep, Real_ptr q_new, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY3; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc4(Real_ptr e_new, Real_ptr work, Real_type e_cut, Real_type emin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY4; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc5(Real_ptr delvc, Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, Real_ptr bvc, Real_ptr p_new, @@ -116,12 +120,14 @@ __global__ void energycalc5(Real_ptr delvc, Real_type rho0, Real_type e_cut, Real_type emin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY5; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc6(Real_ptr delvc, Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, Real_ptr bvc, Real_ptr p_new, @@ -130,14 +136,15 @@ __global__ void energycalc6(Real_ptr delvc, Real_type rho0, Real_type q_cut, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY6; } } -void ENERGY::runHipVariant(VariantID vid) +template < size_t block_size > +void ENERGY::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -154,12 +161,12 @@ void ENERGY::runHipVariant(VariantID vid) const 
size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((energycalc1), dim3(grid_size), dim3(block_size), 0, 0, e_new, e_old, delvc, + hipLaunchKernelGGL((energycalc1), dim3(grid_size), dim3(block_size), 0, 0, e_new, e_old, delvc, p_old, q_old, work, iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc2), dim3(grid_size), dim3(block_size), 0, 0, delvc, q_new, + hipLaunchKernelGGL((energycalc2), dim3(grid_size), dim3(block_size), 0, 0, delvc, q_new, compHalfStep, pHalfStep, e_new, bvc, pbvc, ql_old, qq_old, @@ -167,18 +174,18 @@ void ENERGY::runHipVariant(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc3), dim3(grid_size), dim3(block_size), 0, 0, e_new, delvc, + hipLaunchKernelGGL((energycalc3), dim3(grid_size), dim3(block_size), 0, 0, e_new, delvc, p_old, q_old, pHalfStep, q_new, iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc4), dim3(grid_size), dim3(block_size), 0, 0, e_new, work, + hipLaunchKernelGGL((energycalc4), dim3(grid_size), dim3(block_size), 0, 0, e_new, work, e_cut, emin, iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc5), dim3(grid_size), dim3(block_size), 0, 0, delvc, + hipLaunchKernelGGL((energycalc5), dim3(grid_size), dim3(block_size), 0, 0, delvc, pbvc, e_new, vnewc, bvc, p_new, ql_old, qq_old, @@ -188,7 +195,7 @@ void ENERGY::runHipVariant(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc6), dim3(grid_size), dim3(block_size), 0, 0, delvc, + hipLaunchKernelGGL((energycalc6), dim3(grid_size), dim3(block_size), 0, 0, delvc, pbvc, e_new, vnewc, bvc, p_new, q_new, @@ -251,10 +258,12 @@ void ENERGY::runHipVariant(VariantID vid) ENERGY_DATA_TEARDOWN_HIP; } else { - std::cout << "\n ENERGY : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ENERGY, Hip) + } 
// end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY-OMP.cpp b/src/apps/ENERGY-OMP.cpp index 60c907822..f06c2efe9 100644 --- a/src/apps/ENERGY-OMP.cpp +++ b/src/apps/ENERGY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void ENERGY::runOpenMPVariant(VariantID vid) +void ENERGY::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -27,7 +27,7 @@ void ENERGY::runOpenMPVariant(VariantID vid) const Index_type iend = getActualProblemSize(); ENERGY_DATA_SETUP; - + auto energy_lam1 = [=](Index_type i) { ENERGY_BODY1; }; @@ -93,7 +93,7 @@ void ENERGY::runOpenMPVariant(VariantID vid) break; } - + case Lambda_OpenMP : { startTimer(); @@ -154,16 +154,16 @@ void ENERGY::runOpenMPVariant(VariantID vid) RAJA::forall< RAJA::omp_for_nowait_static_exec< > >( RAJA::RangeSegment(ibegin, iend), energy_lam3); - + RAJA::forall< RAJA::omp_for_nowait_static_exec< > >( RAJA::RangeSegment(ibegin, iend), energy_lam4); - + RAJA::forall< RAJA::omp_for_nowait_static_exec< > >( RAJA::RangeSegment(ibegin, iend), energy_lam5); - + RAJA::forall< RAJA::omp_for_nowait_static_exec< > >( RAJA::RangeSegment(ibegin, iend), energy_lam6); - + }); // end omp parallel region } @@ -172,12 +172,12 @@ void ENERGY::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n ENERGY : Unknown variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp index 
7478e7dd8..3027bd25f 100644 --- a/src/apps/ENERGY-OMPTarget.cpp +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -65,7 +65,7 @@ namespace apps deallocOpenMPDeviceData(qq_old, did); \ deallocOpenMPDeviceData(vnewc, did); -void ENERGY::runOpenMPTargetVariant(VariantID vid) +void ENERGY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -123,7 +123,7 @@ void ENERGY::runOpenMPTargetVariant(VariantID vid) for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY6; } - + } stopTimer(); @@ -157,7 +157,7 @@ void ENERGY::runOpenMPTargetVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { ENERGY_BODY4; }); - + RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { ENERGY_BODY5; @@ -176,7 +176,7 @@ void ENERGY::runOpenMPTargetVariant(VariantID vid) ENERGY_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n ENERGY : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index c6b69d0af..5bc229c6e 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,20 +12,20 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void ENERGY::runSeqVariant(VariantID vid) +void ENERGY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); ENERGY_DATA_SETUP; - + auto energy_lam1 = [=](Index_type i) { ENERGY_BODY1; }; @@ -67,7 +67,7 @@ void ENERGY::runSeqVariant(VariantID vid) for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY4; } - + for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY5; } @@ -80,7 +80,7 @@ void ENERGY::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -146,14 +146,14 @@ void ENERGY::runSeqVariant(VariantID vid) }); // end sequential region (for single-source code) } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n ENERGY : Unknown variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index a6a779f8c..9ed11381a 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -68,7 +68,7 @@ ENERGY::~ENERGY() { } -void ENERGY::setUp(VariantID vid) +void ENERGY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_e_new, getActualProblemSize(), 0.0, vid); allocAndInitData(m_e_old, getActualProblemSize(), vid); @@ -92,13 +92,13 @@ void ENERGY::setUp(VariantID vid) initData(m_q_cut); } -void ENERGY::updateChecksum(VariantID vid) +void ENERGY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_e_new, getActualProblemSize()); - checksum[vid] += calcChecksum(m_q_new, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_e_new, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_q_new, getActualProblemSize()); } -void ENERGY::tearDown(VariantID vid) +void ENERGY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 00a45de1d..6461fdd5f 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -194,17 +194,27 @@ class ENERGY : public KernelBase ~ENERGY(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_e_new; Real_ptr m_e_old; Real_ptr m_delvc; diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp new file mode 100644 index 000000000..474ada22b --- /dev/null +++ b/src/apps/FEM_MACROS.hpp @@ -0,0 +1,29 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#ifndef RAJAPerf_FEM_MACROS_HPP +#define RAJAPerf_FEM_MACROS_HPP + +#if defined(USE_RAJAPERF_UNROLL) +// If enabled uses RAJA's RAJA_UNROLL_COUNT which is always on +#define RAJAPERF_UNROLL(N) RAJA_UNROLL_COUNT(N) +#else +#define RAJAPERF_UNROLL(N) +#endif + +// Need two different host/device macros due to +// how hipcc/clang works. +// See note in MAT_MAT_SHARED regarding hipcc/clang +// builds. +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +#define GPU_FOREACH_THREAD(i, k, N) \ + for (int i = threadIdx.k; i < N; i += blockDim.k) +#endif + +#define CPU_FOREACH(i, k, N) for (int i = 0; i < N; i++) + +#endif // closing endif for header file include guard diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index baff493ac..4dea7c82e 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -25,12 +25,6 @@ namespace apps #define USE_CUDA_CONSTANT_MEMORY //#undef USE_CUDA_CONSTANT_MEMORY - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #if defined(USE_CUDA_CONSTANT_MEMORY) __constant__ Real_type coeff[FIR_COEFFLEN]; @@ -46,11 +40,13 @@ __constant__ Real_type coeff[FIR_COEFFLEN]; deallocCudaDeviceData(in); \ deallocCudaDeviceData(out); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void fir(Real_ptr out, Real_ptr in, const Index_type coefflen, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIR_BODY; } @@ -73,12 +69,14 @@ __global__ void fir(Real_ptr out, Real_ptr in, deallocCudaDeviceData(out); \ deallocCudaDeviceData(coeff); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void fir(Real_ptr out, Real_ptr in, Real_ptr coeff, const Index_type coefflen, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIR_BODY; } @@ -87,7 +85,8 @@ __global__ void fir(Real_ptr out, Real_ptr in, #endif -void FIR::runCudaVariant(VariantID vid) +template < size_t block_size > +void FIR::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -107,12 +106,12 @@ void FIR::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); #if defined(USE_CUDA_CONSTANT_MEMORY) - fir<<>>( out, in, + fir<<>>( out, in, coefflen, iend ); cudaErrchk( cudaGetLastError() ); #else - fir<<>>( out, in, + fir<<>>( out, in, coeff, coefflen, iend ); @@ -144,10 +143,12 @@ void FIR::runCudaVariant(VariantID vid) FIR_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n FIR : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n FIR : Unknown Cuda variant id = " << vid << std::endl; } } 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIR, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 4408a714c..42e3503e0 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,12 +25,6 @@ namespace apps #define USE_HIP_CONSTANT_MEMORY // #undef USE_HIP_CONSTANT_MEMORY - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #if defined(USE_HIP_CONSTANT_MEMORY) __constant__ Real_type coeff[FIR_COEFFLEN]; @@ -46,11 +40,13 @@ __constant__ Real_type coeff[FIR_COEFFLEN]; deallocHipDeviceData(in); \ deallocHipDeviceData(out); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void fir(Real_ptr out, Real_ptr in, const Index_type coefflen, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIR_BODY; } @@ -73,12 +69,14 @@ __global__ void fir(Real_ptr out, Real_ptr in, deallocHipDeviceData(out); \ deallocHipDeviceData(coeff); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void fir(Real_ptr out, Real_ptr in, Real_ptr coeff, const Index_type coefflen, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIR_BODY; } @@ -87,7 +85,8 @@ __global__ void fir(Real_ptr out, Real_ptr in, #endif -void FIR::runHipVariant(VariantID vid) +template < size_t block_size > +void FIR::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -107,12 
+106,12 @@ void FIR::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); #if defined(USE_HIP_CONSTANT_MEMORY) - hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), 0, 0, out, in, + hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), 0, 0, out, in, coefflen, iend ); hipErrchk( hipGetLastError() ); #else - hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), 0, 0, out, in, + hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), 0, 0, out, in, coeff, coefflen, iend ); @@ -144,10 +143,12 @@ void FIR::runHipVariant(VariantID vid) FIR_DATA_TEARDOWN_HIP; } else { - std::cout << "\n FIR : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n FIR : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIR, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index 10928fe76..7a5415130 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace apps { -void FIR::runOpenMPVariant(VariantID vid) +void FIR::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -37,7 +37,7 @@ void FIR::runOpenMPVariant(VariantID vid) auto fir_lam = [=](Index_type i) { FIR_BODY; }; - + switch ( vid ) { case Base_OpenMP : { @@ -87,12 +87,12 @@ void FIR::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n FIR : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIR : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 3103cfd67..90be7bd3a 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -17,7 +17,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -46,7 +46,7 @@ namespace apps deallocOpenMPDeviceData(coeff, did); -void FIR::runOpenMPTargetVariant(VariantID vid) +void FIR::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -94,7 +94,7 @@ void FIR::runOpenMPTargetVariant(VariantID vid) FIR_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n FIR : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n FIR : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp index cd10b7069..3a196a1f1 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace apps { -void FIR::runSeqVariant(VariantID vid) +void FIR::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -35,7 +35,7 @@ void FIR::runSeqVariant(VariantID vid) auto fir_lam = [=](Index_type i) { FIR_BODY; }; - + switch ( vid ) { case Base_Seq : { @@ -51,7 +51,7 @@ void FIR::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -78,14 +78,14 @@ void FIR::runSeqVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), fir_lam); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n FIR : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIR : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index fe3993cd9..8dd25358e 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -33,7 +33,7 @@ FIR::FIR(const RunParams& params) setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep((2 * m_coefflen) * (getActualProblemSize() - m_coefflen)); - + checksum_scale_factor = 0.0001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); @@ -62,18 +62,18 @@ FIR::~FIR() { } -void FIR::setUp(VariantID vid) +void FIR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_in, getActualProblemSize(), vid); allocAndInitDataConst(m_out, getActualProblemSize(), 0.0, vid); } -void FIR::updateChecksum(VariantID vid) +void FIR::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor ); } -void FIR::tearDown(VariantID vid) +void FIR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index e9b49edcb..dd46d9934 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -69,17 +69,27 @@ class FIR : public KernelBase ~FIR(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_in; Real_ptr m_out; diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index 4633b4a7f..cab4f911d 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define HALOEXCHANGE_DATA_SETUP_CUDA \ for (Index_type v = 0; v < m_num_vars; ++v) { \ allocAndInitCudaDeviceData(vars[v], m_vars[v], m_var_size); \ @@ -48,20 +42,24 @@ namespace apps deallocCudaDeviceData(vars[v]); \ } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { - Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { HALOEXCHANGE_PACK_BODY; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { - Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { HALOEXCHANGE_UNPACK_BODY; @@ -69,7 +67,8 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, } -void HALOEXCHANGE::runCudaVariant(VariantID vid) +template < size_t block_size > +void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -90,7 +89,7 @@ void HALOEXCHANGE::runCudaVariant(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - haloexchange_pack<<>>(buffer, list, var, len); + haloexchange_pack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } @@ -105,7 +104,7 @@ void HALOEXCHANGE::runCudaVariant(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - haloexchange_unpack<<>>(buffer, list, var, len); + haloexchange_unpack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } @@ -166,10 +165,12 @@ void 
HALOEXCHANGE::runCudaVariant(VariantID vid) HALOEXCHANGE_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HALOEXCHANGE, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index fd6fac040..4070edc72 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define HALOEXCHANGE_DATA_SETUP_HIP \ for (Index_type v = 0; v < m_num_vars; ++v) { \ allocAndInitHipDeviceData(vars[v], m_vars[v], m_var_size); \ @@ -48,20 +42,24 @@ namespace apps deallocHipDeviceData(vars[v]); \ } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { - Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { HALOEXCHANGE_PACK_BODY; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { - Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { HALOEXCHANGE_UNPACK_BODY; @@ -69,7 +67,8 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, } 
-void HALOEXCHANGE::runHipVariant(VariantID vid) +template < size_t block_size > +void HALOEXCHANGE::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -90,7 +89,7 @@ void HALOEXCHANGE::runHipVariant(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, 0, 0, buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; @@ -106,7 +105,7 @@ void HALOEXCHANGE::runHipVariant(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, 0, 0, buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; @@ -168,10 +167,12 @@ void HALOEXCHANGE::runHipVariant(VariantID vid) HALOEXCHANGE_DATA_TEARDOWN_HIP; } else { - std::cout << "\n HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HALOEXCHANGE, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/apps/HALOEXCHANGE-OMP.cpp index 70e9419a8..daa1dbad8 100644 --- a/src/apps/HALOEXCHANGE-OMP.cpp +++ b/src/apps/HALOEXCHANGE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE::runOpenMPVariant(VariantID vid) +void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -158,12 +158,12 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/apps/HALOEXCHANGE-OMPTarget.cpp index cfb10f7ec..4c8f1655c 100644 --- a/src/apps/HALOEXCHANGE-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -51,7 +51,7 @@ namespace apps } -void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid) +void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -146,7 +146,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid) HALOEXCHANGE_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/apps/HALOEXCHANGE-Seq.cpp index 7ebace6f7..755a47390 100644 --- a/src/apps/HALOEXCHANGE-Seq.cpp +++ b/src/apps/HALOEXCHANGE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE::runSeqVariant(VariantID vid) +void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -154,7 +154,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index db7c7bb90..890fcf0a9 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -104,7 +104,7 @@ HALOEXCHANGE::~HALOEXCHANGE() { } -void HALOEXCHANGE::setUp(VariantID vid) +void HALOEXCHANGE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { @@ -132,14 +132,14 @@ void HALOEXCHANGE::setUp(VariantID vid) } } -void HALOEXCHANGE::updateChecksum(VariantID vid) +void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { - checksum[vid] += calcChecksum(var, m_var_size); + checksum[vid][tune_idx] += calcChecksum(var, m_var_size); } } -void HALOEXCHANGE::tearDown(VariantID vid) +void HALOEXCHANGE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l]); diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index d10bd4790..5d653762a 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -84,17 +84,27 @@ class HALOEXCHANGE : public KernelBase ~HALOEXCHANGE(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + static const int s_num_neighbors = 26; Index_type m_grid_dims[3]; diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 114ed61ba..52d1fca5c 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 1024; - - #define HALOEXCHANGE_FUSED_DATA_SETUP_CUDA \ for (Index_type v = 0; v < m_num_vars; ++v) { \ allocAndInitCudaDeviceData(vars[v], m_vars[v], m_var_size); \ @@ -77,6 +71,8 @@ namespace apps deallocCudaPinnedData(unpack_var_ptrs); \ deallocCudaPinnedData(unpack_len_ptrs); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { @@ -87,13 +83,15 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac Real_ptr var = pack_var_ptrs[j]; Index_type len = pack_len_ptrs[j]; - for (Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; - i += blockDim.x * gridDim.x) { + i += block_size * gridDim.x) { HALOEXCHANGE_FUSED_PACK_BODY; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { @@ -104,15 +102,16 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* Real_ptr var = unpack_var_ptrs[j]; Index_type len = unpack_len_ptrs[j]; - for (Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; - i += blockDim.x * gridDim.x) { + i += block_size * gridDim.x) { HALOEXCHANGE_FUSED_UNPACK_BODY; } } -void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid) +template < size_t block_size > +void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -148,7 +147,7 @@ void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; 
dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - haloexchange_fused_pack<<>>( + haloexchange_fused_pack<<>>( pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); cudaErrchk( cudaGetLastError() ); synchronize(); @@ -174,7 +173,7 @@ void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - haloexchange_fused_unpack<<>>( + haloexchange_fused_unpack<<>>( unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); cudaErrchk( cudaGetLastError() ); synchronize(); @@ -267,10 +266,12 @@ void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid) HALOEXCHANGE_FUSED_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HALOEXCHANGE_FUSED, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 1288f9429..7d64d86f7 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 1024; - - #define HALOEXCHANGE_FUSED_DATA_SETUP_HIP \ for (Index_type v = 0; v < m_num_vars; ++v) { \ allocAndInitHipDeviceData(vars[v], m_vars[v], m_var_size); \ @@ -76,6 +70,8 @@ namespace apps deallocHipPinnedData(unpack_var_ptrs); \ deallocHipPinnedData(unpack_len_ptrs); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { @@ -86,13 +82,15 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac Real_ptr var = pack_var_ptrs[j]; Index_type len = pack_len_ptrs[j]; - for (Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; - i += blockDim.x * gridDim.x) { + i += block_size * gridDim.x) { HALOEXCHANGE_FUSED_PACK_BODY; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { @@ -103,15 +101,16 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* Real_ptr var = unpack_var_ptrs[j]; Index_type len = unpack_len_ptrs[j]; - for (Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; - i += blockDim.x * gridDim.x) { + i += block_size * gridDim.x) { HALOEXCHANGE_FUSED_UNPACK_BODY; } } -void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid) +template < size_t block_size > +void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -147,7 +146,7 @@ void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 
pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, 0, 0, pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); hipErrchk( hipGetLastError() ); synchronize(); @@ -173,7 +172,7 @@ void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, 0, 0, unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); hipErrchk( hipGetLastError() ); synchronize(); @@ -270,10 +269,12 @@ void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid) HALOEXCHANGE_FUSED_DATA_TEARDOWN_HIP; } else { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HALOEXCHANGE_FUSED, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index 71a7fe22f..0400c20b0 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid) +void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -297,7 +297,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 8f8199026..7c465681c 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -87,7 +87,7 @@ namespace apps delete[] h_unpack_ptrs; -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid) +void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -261,7 +261,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid) HALOEXCHANGE_FUSED_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index 4bb5207b7..984aaf724 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid) +void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -229,7 +229,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 6882cf51e..406cc654b 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -104,7 +104,7 @@ HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() { } -void HALOEXCHANGE_FUSED::setUp(VariantID vid) +void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { @@ -132,14 +132,14 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid) } } -void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid) +void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { - checksum[vid] += calcChecksum(var, m_var_size); + checksum[vid][tune_idx] += calcChecksum(var, m_var_size); } } -void HALOEXCHANGE_FUSED::tearDown(VariantID vid) +void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l]); diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index 68e81da59..e47c1e14e 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -128,17 +128,27 @@ class HALOEXCHANGE_FUSED : public KernelBase ~HALOEXCHANGE_FUSED(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 1024; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + static const int s_num_neighbors = 26; Index_type m_grid_dims[3]; diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index f1f47b5a2..4e38f769b 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,14 +22,18 @@ namespace apps { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t z_block_sz = 2; -constexpr size_t g_block_sz = 4; -constexpr size_t m_block_sz = 32; +#define m_block_sz (32) +#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) + +#define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + m_block_sz, g_block_sz, z_block_sz #define LTIMES_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); + dim3 nthreads_per_block(LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); \ + static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size"); #define LTIMES_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ @@ -48,28 +52,31 @@ constexpr size_t m_block_sz = 32; deallocCudaDeviceData(elldat); \ deallocCudaDeviceData(psidat); +template < size_t m_block_size, size_t g_block_size, size_t z_block_size > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, - Index_type num_d, + Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; - if (m < num_m && g < num_g && z < num_z) { + if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_BODY; } } } -template< typename Lambda > +template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > 
+__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -77,7 +84,8 @@ __global__ void ltimes_lam(Index_type num_m, Index_type num_g, Index_type num_z, } -void LTIMES::runCudaVariant(VariantID vid) +template < size_t block_size > +void LTIMES::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -93,8 +101,9 @@ void LTIMES::runCudaVariant(VariantID vid) LTIMES_THREADS_PER_BLOCK_CUDA; LTIMES_NBLOCKS_CUDA; - ltimes<<>>(phidat, elldat, psidat, - num_d, + ltimes + <<>>(phidat, elldat, psidat, + num_d, num_m, num_g, num_z); cudaErrchk( cudaGetLastError() ); @@ -113,7 +122,8 @@ void LTIMES::runCudaVariant(VariantID vid) LTIMES_THREADS_PER_BLOCK_CUDA; LTIMES_NBLOCKS_CUDA; - ltimes_lam<<>>(num_m, num_g, num_z, + ltimes_lam + <<>>(num_m, num_g, num_z, [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_BODY; @@ -139,9 +149,9 @@ void LTIMES::runCudaVariant(VariantID vid) RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_z_direct, RAJA::statement::Tile<2, RAJA::tile_fixed, - RAJA::cuda_block_y_direct, + RAJA::cuda_block_y_direct, RAJA::statement::Tile<3, RAJA::tile_fixed, - RAJA::cuda_block_x_direct, + RAJA::cuda_block_x_direct, RAJA::statement::For<1, RAJA::cuda_thread_z_direct, //z RAJA::statement::For<2, RAJA::cuda_thread_y_direct, //g RAJA::statement::For<3, RAJA::cuda_thread_x_direct, //m @@ -174,10 +184,12 @@ void LTIMES::runCudaVariant(VariantID vid) LTIMES_DATA_TEARDOWN_CUDA; } else 
{ - std::cout << "\n LTIMES : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(LTIMES, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 4d28aa028..a78394d25 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,14 +22,17 @@ namespace apps { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t z_block_sz = 2; -constexpr size_t g_block_sz = 4; -constexpr size_t m_block_sz = 32; +#define m_block_sz (32) +#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) + +#define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + m_block_sz, g_block_sz, z_block_sz #define LTIMES_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); + dim3 nthreads_per_block(LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); #define LTIMES_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ @@ -48,13 +51,15 @@ constexpr size_t m_block_sz = 32; deallocHipDeviceData(elldat); \ deallocHipDeviceData(psidat); +template < size_t m_block_size, size_t g_block_size, size_t z_block_size > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - 
Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { @@ -63,13 +68,14 @@ __global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, } } -template< typename Lambda > +template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -77,7 +83,8 @@ __global__ void ltimes_lam(Index_type num_m, Index_type num_g, Index_type num_z, } -void LTIMES::runHipVariant(VariantID vid) +template < size_t block_size > +void LTIMES::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -93,10 +100,10 @@ void LTIMES::runHipVariant(VariantID vid) LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; - hipLaunchKernelGGL((ltimes), + hipLaunchKernelGGL((ltimes), dim3(nblocks), dim3(nthreads_per_block), 0, 0, phidat, elldat, psidat, - num_d, + num_d, num_m, num_g, num_z); hipErrchk( hipGetLastError() ); @@ -115,14 +122,14 @@ void LTIMES::runHipVariant(VariantID vid) LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; - auto ltimes_lambda = + auto ltimes_lambda = [=] __device__ (Index_type z, Index_type g, Index_type 
m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_BODY; } }; - hipLaunchKernelGGL((ltimes_lam), + hipLaunchKernelGGL((ltimes_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, num_m, num_g, num_z, ltimes_lambda); hipErrchk( hipGetLastError() ); @@ -179,10 +186,12 @@ void LTIMES::runHipVariant(VariantID vid) LTIMES_DATA_TEARDOWN_HIP; } else { - std::cout << "\n LTIMES : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(LTIMES, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES-OMP.cpp b/src/apps/LTIMES-OMP.cpp index 397d2bc11..91d0faeac 100644 --- a/src/apps/LTIMES-OMP.cpp +++ b/src/apps/LTIMES-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void LTIMES::runOpenMPVariant(VariantID vid) +void LTIMES::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -42,7 +42,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) } } } - } + } } stopTimer(); @@ -52,7 +52,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) case Lambda_OpenMP : { - auto ltimes_base_lam = [=](Index_type d, Index_type z, + auto ltimes_base_lam = [=](Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_BODY; }; @@ -85,7 +85,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) LTIMES_BODY_RAJA; }; - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // z RAJA::statement::For<2, RAJA::loop_exec, // g @@ -93,7 +93,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) RAJA::statement::For<0, RAJA::loop_exec, // d RAJA::statement::Lambda<0> > - > + > > > >; @@ -104,7 +104,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple(IDRange(0, num_d), IZRange(0, num_z), IGRange(0, num_g), - IMRange(0, num_m)), + IMRange(0, num_m)), ltimes_lam ); @@ -115,12 +115,12 @@ void LTIMES::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n LTIMES : Unknown variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp index 07f93643b..e89e6cbfa 100644 --- a/src/apps/LTIMES-OMPTarget.cpp +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -36,7 +36,7 @@ namespace apps deallocOpenMPDeviceData(psidat, did); -void LTIMES::runOpenMPTargetVariant(VariantID vid) +void LTIMES::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -72,7 +72,7 @@ void LTIMES::runOpenMPTargetVariant(VariantID vid) LTIMES_VIEWS_RANGES_RAJA; - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::Collapse, // z, g, m @@ -91,7 +91,7 @@ void LTIMES::runOpenMPTargetVariant(VariantID vid) IMRange(0, num_m)), [=] (ID d, IZ z, IG g, IM m) { LTIMES_BODY_RAJA; - }); + }); } stopTimer(); @@ -99,7 +99,7 @@ void LTIMES::runOpenMPTargetVariant(VariantID vid) LTIMES_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n LTIMES : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES-Seq.cpp b/src/apps/LTIMES-Seq.cpp index a6b4c6fe8..92fd7c319 100644 --- a/src/apps/LTIMES-Seq.cpp +++ b/src/apps/LTIMES-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void LTIMES::runSeqVariant(VariantID vid) +void LTIMES::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -45,12 +45,12 @@ void LTIMES::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto ltimes_base_lam = [=](Index_type d, Index_type z, + auto ltimes_base_lam = [=](Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_BODY; }; @@ -83,7 +83,7 @@ void LTIMES::runSeqVariant(VariantID vid) }; - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::loop_exec, // z RAJA::statement::For<2, RAJA::loop_exec, // g @@ -94,27 +94,27 @@ void LTIMES::runSeqVariant(VariantID vid) > > > - >; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + RAJA::kernel( RAJA::make_tuple(IDRange(0, num_d), IZRange(0, num_z), IGRange(0, num_g), - IMRange(0, num_m)), + IMRange(0, num_m)), ltimes_lam ); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n LTIMES : Unknown variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index c69a2b300..ede451a0a 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -31,7 +31,7 @@ LTIMES::LTIMES(const RunParams& params) setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / + m_num_z = std::max( getTargetProblemSize() / (m_num_d_default * m_num_g_default), Index_type(1) ); m_num_g = m_num_g_default; @@ -54,7 +54,7 @@ LTIMES::LTIMES(const RunParams& params) checksum_scale_factor = 0.001 * ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); + getActualProblemSize() ); setUsesFeature(Kernel); setUsesFeature(View); @@ -83,19 +83,19 @@ LTIMES::~LTIMES() { } -void LTIMES::setUp(VariantID vid) +void LTIMES::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); allocAndInitData(m_elldat, int(m_elllen), vid); allocAndInitData(m_psidat, int(m_psilen), vid); } -void LTIMES::updateChecksum(VariantID vid) +void LTIMES::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } -void LTIMES::tearDown(VariantID vid) +void LTIMES::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 6177873be..31eae0f83 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -107,17 +107,28 @@ class LTIMES : public KernelBase ~LTIMES(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Real_ptr m_phidat; Real_ptr m_elldat; Real_ptr m_psidat; diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 1a7403ece..b363f0049 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,14 +22,17 @@ namespace apps { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t z_block_sz = 2; -constexpr size_t g_block_sz = 4; -constexpr size_t m_block_sz = 32; +#define m_block_sz (32) +#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) + +#define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + m_block_sz, g_block_sz, z_block_sz #define LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); + dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); #define LTIMES_NOVIEW_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ @@ -49,13 +52,15 @@ constexpr size_t m_block_sz = 32; deallocCudaDeviceData(elldat); \ deallocCudaDeviceData(psidat); +template < size_t m_block_size, size_t g_block_size, size_t z_block_size > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { @@ -64,13 +69,14 @@ __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, } } -template< typename Lambda > +template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void 
ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -78,7 +84,8 @@ __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type } -void LTIMES_NOVIEW::runCudaVariant(VariantID vid) +template < size_t block_size > +void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -94,7 +101,8 @@ void LTIMES_NOVIEW::runCudaVariant(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; LTIMES_NOVIEW_NBLOCKS_CUDA; - ltimes_noview<<>>(phidat, elldat, psidat, + ltimes_noview + <<>>(phidat, elldat, psidat, num_d, num_m, num_g, num_z); cudaErrchk( cudaGetLastError() ); @@ -114,7 +122,8 @@ void LTIMES_NOVIEW::runCudaVariant(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; LTIMES_NOVIEW_NBLOCKS_CUDA; - ltimes_noview_lam<<>>(num_m, num_g, num_z, + ltimes_noview_lam + <<>>(num_m, num_g, num_z, [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; @@ -173,10 +182,12 @@ void LTIMES_NOVIEW::runCudaVariant(VariantID vid) LTIMES_NOVIEW_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n LTIMES_NOVIEW : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(LTIMES_NOVIEW, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 8f36737d8..47a8c8956 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ 
b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,14 +22,17 @@ namespace apps { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t z_block_sz = 2; -constexpr size_t g_block_sz = 4; -constexpr size_t m_block_sz = 32; +#define m_block_sz (32) +#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) + +#define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + m_block_sz, g_block_sz, z_block_sz #define LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); + dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); #define LTIMES_NOVIEW_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ @@ -48,13 +51,15 @@ constexpr size_t m_block_sz = 32; deallocHipDeviceData(elldat); \ deallocHipDeviceData(psidat); +template < size_t m_block_size, size_t g_block_size, size_t z_block_size > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < 
num_z) { for (Index_type d = 0; d < num_d; ++d ) { @@ -63,13 +68,14 @@ __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, } } -template< typename Lambda > +template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -77,7 +83,8 @@ __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type } -void LTIMES_NOVIEW::runHipVariant(VariantID vid) +template < size_t block_size > +void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -93,10 +100,10 @@ void LTIMES_NOVIEW::runHipVariant(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; LTIMES_NOVIEW_NBLOCKS_HIP; - hipLaunchKernelGGL((ltimes_noview), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((ltimes_noview), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, phidat, elldat, psidat, - num_d, + num_d, num_m, num_g, num_z); hipErrchk( hipGetLastError() ); @@ -115,15 +122,15 @@ void LTIMES_NOVIEW::runHipVariant(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; LTIMES_NOVIEW_NBLOCKS_HIP; - auto ltimes_noview_lambda = + auto ltimes_noview_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; } }; - hipLaunchKernelGGL((ltimes_noview_lam), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((ltimes_noview_lam), + dim3(nblocks), 
dim3(nthreads_per_block), 0, 0, num_m, num_g, num_z, ltimes_noview_lambda); hipErrchk( hipGetLastError() ); @@ -178,10 +185,12 @@ void LTIMES_NOVIEW::runHipVariant(VariantID vid) LTIMES_NOVIEW_DATA_TEARDOWN_HIP; } else { - std::cout << "\n LTIMES_NOVIEW : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(LTIMES_NOVIEW, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW-OMP.cpp b/src/apps/LTIMES_NOVIEW-OMP.cpp index f47fec499..e41853651 100644 --- a/src/apps/LTIMES_NOVIEW-OMP.cpp +++ b/src/apps/LTIMES_NOVIEW-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,21 +12,21 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid) +void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); LTIMES_NOVIEW_DATA_SETUP; - - auto ltimesnoview_lam = [=](Index_type d, Index_type z, + + auto ltimesnoview_lam = [=](Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_NOVIEW_BODY; }; @@ -47,7 +47,7 @@ void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid) } } } - } + } } stopTimer(); @@ -109,12 +109,12 @@ void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp index 3734b889f..ca49c8859 100644 --- a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -36,7 +36,7 @@ namespace apps deallocOpenMPDeviceData(psidat, did); -void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid) +void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -97,7 +97,7 @@ void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid) LTIMES_NOVIEW_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n LTIMES_NOVIEW : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index 7d98e5c53..7da062715 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,19 +12,19 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void LTIMES_NOVIEW::runSeqVariant(VariantID vid) +void LTIMES_NOVIEW::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); LTIMES_NOVIEW_DATA_SETUP; - - auto ltimesnoview_lam = [=](Index_type d, Index_type z, + + auto ltimesnoview_lam = [=](Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_NOVIEW_BODY; }; @@ -50,7 +50,7 @@ void LTIMES_NOVIEW::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -100,14 +100,14 @@ void LTIMES_NOVIEW::runSeqVariant(VariantID vid) ); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 5d341cea3..c0c0f7413 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -31,7 +31,7 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / + m_num_z = std::max( getTargetProblemSize() / (m_num_d_default * m_num_g_default), Index_type(1) ); m_num_g = m_num_g_default; @@ -82,19 +82,19 @@ LTIMES_NOVIEW::~LTIMES_NOVIEW() { } -void LTIMES_NOVIEW::setUp(VariantID vid) +void LTIMES_NOVIEW::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); allocAndInitData(m_elldat, int(m_elllen), vid); allocAndInitData(m_psidat, int(m_psilen), vid); } -void LTIMES_NOVIEW::updateChecksum(VariantID vid) +void LTIMES_NOVIEW::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } -void LTIMES_NOVIEW::tearDown(VariantID vid) +void LTIMES_NOVIEW::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 24c524ecc..1385864fb 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -57,17 +57,28 @@ class LTIMES_NOVIEW : public KernelBase ~LTIMES_NOVIEW(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Real_ptr m_phidat; Real_ptr m_elldat; Real_ptr m_psidat; diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 0de50ee15..b872a2a3c 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -1,11 +1,14 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + #include "MASS3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -19,95 +22,88 @@ namespace rajaperf { namespace apps { -#define MASS3DPA_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(B, m_B, Q1D *D1D); \ - allocAndInitCudaDeviceData(Bt, m_Bt, Q1D *D1D); \ - allocAndInitCudaDeviceData(D, m_D, Q1D *Q1D *Q1D *m_NE); \ - allocAndInitCudaDeviceData(X, m_X, D1D *D1D *D1D *m_NE); \ - allocAndInitCudaDeviceData(Y, m_Y, D1D *D1D *D1D *m_NE); +#define MASS3DPA_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(B, m_B, MPA_Q1D *MPA_D1D); \ + allocAndInitCudaDeviceData(Bt, m_Bt, MPA_Q1D *MPA_D1D); \ + allocAndInitCudaDeviceData(D, m_D, MPA_Q1D *MPA_Q1D *MPA_Q1D *m_NE); \ + allocAndInitCudaDeviceData(X, m_X, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ + allocAndInitCudaDeviceData(Y, m_Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); #define MASS3DPA_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ + getCudaDeviceData(m_Y, Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ deallocCudaDeviceData(B); \ deallocCudaDeviceData(Bt); \ deallocCudaDeviceData(D); \ deallocCudaDeviceData(X); \ deallocCudaDeviceData(Y); -//#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) \ - for (int i = threadIdx.k; i < N; i += blockDim.k) - -__global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, +template < size_t block_size > + __launch_bounds__(block_size) +__global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { const int e = blockIdx.x; MASS3DPA_0_GPU - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + 
GPU_FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, Q1D) { + GPU_FOREACH_THREAD(dx, x, MPA_Q1D) { MASS3DPA_2 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_3 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_4 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_5 } } __syncthreads(); - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { + GPU_FOREACH_THREAD(d, y, MPA_D1D) { + GPU_FOREACH_THREAD(q, x, MPA_Q1D) { MASS3DPA_6 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_8 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_9 } } } -void MASS3DPA::runCudaVariant(VariantID vid) { +template < size_t block_size > +void MASS3DPA::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; @@ -121,9 +117,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - dim3 nthreads_per_block(Q1D, Q1D, 1); + dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - Mass3DPA<<>>(NE, B, Bt, D, X, Y); + Mass3DPA<<>>(B, Bt, D, X, Y); cudaErrchk( cudaGetLastError() ); } @@ -138,29 +134,22 @@ void MASS3DPA::runCudaVariant(VariantID vid) { MASS3DPA_DATA_SETUP_CUDA; - using launch_policy = 
RAJA::expt::LaunchPolicy - >; + constexpr bool async = true; + + using launch_policy = RAJA::expt::LaunchPolicy>; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), - RAJA::expt::Threads(Q1D, Q1D, 1)), + RAJA::expt::Threads(MPA_Q1D, MPA_Q1D, 1)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), @@ -168,15 +157,15 @@ void MASS3DPA::runCudaVariant(VariantID vid) { MASS3DPA_0_GPU - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_1 } ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { MASS3DPA_2 } @@ -186,9 +175,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_3 } @@ -198,21 +187,21 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_4 } - ); // RAJA::expt::loop + ); // RAJA::expt::loop } ); // RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, 
RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_5 } @@ -222,9 +211,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int q) { MASS3DPA_6 } @@ -234,21 +223,21 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_7 } - ); // RAJA::expt::loop + ); // RAJA::expt::loop } ); // RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_8 } @@ -258,9 +247,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_9 } @@ -284,12 +273,14 @@ void MASS3DPA::runCudaVariant(VariantID vid) { default: { - std::cout << "\n MASS3DPA : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n MASS3DPA : Unknown Cuda variant id = " << vid << std::endl; break; } } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MASS3DPA, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git 
a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index a53fe3cf5..804a858fa 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -1,11 +1,14 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + #include "MASS3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -20,94 +23,87 @@ namespace rajaperf { namespace apps { #define MASS3DPA_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(B, m_B, Q1D *D1D); \ - allocAndInitHipDeviceData(Bt, m_Bt, Q1D *D1D); \ - allocAndInitHipDeviceData(D, m_D, Q1D *Q1D *Q1D *m_NE); \ - allocAndInitHipDeviceData(X, m_X, D1D *D1D *D1D *m_NE); \ - allocAndInitHipDeviceData(Y, m_Y, D1D *D1D *D1D *m_NE); + allocAndInitHipDeviceData(B, m_B, MPA_Q1D *MPA_D1D); \ + allocAndInitHipDeviceData(Bt, m_Bt, MPA_Q1D *MPA_D1D); \ + allocAndInitHipDeviceData(D, m_D, MPA_Q1D *MPA_Q1D *MPA_Q1D *m_NE); \ + allocAndInitHipDeviceData(X, m_X, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ + allocAndInitHipDeviceData(Y, m_Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); #define MASS3DPA_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ + getHipDeviceData(m_Y, Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ deallocHipDeviceData(B); \ deallocHipDeviceData(Bt); \ deallocHipDeviceData(D); \ deallocHipDeviceData(X); \ deallocHipDeviceData(Y); -//#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) \ - for(int i=hipThreadIdx_ ##k; i + 
__launch_bounds__(block_size) +__global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { const int e = hipBlockIdx_x; MASS3DPA_0_GPU - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, Q1D) { + GPU_FOREACH_THREAD(dx, x, MPA_Q1D) { MASS3DPA_2 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_3 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_4 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_5 } } __syncthreads(); - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { + GPU_FOREACH_THREAD(d, y, MPA_D1D) { + GPU_FOREACH_THREAD(q, x, MPA_Q1D) { MASS3DPA_6 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_8 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_9 } } } -void MASS3DPA::runHipVariant(VariantID vid) { +template < size_t block_size > +void MASS3DPA::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; @@ -118,14 +114,14 @@ void MASS3DPA::runHipVariant(VariantID vid) { MASS3DPA_DATA_SETUP_HIP; - dim3 grid_size(NE); - dim3 block_size(Q1D, Q1D, 1); + dim3 nblocks(NE); + dim3 
nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Mass3DPA), dim3(grid_size), dim3(block_size), 0, 0, - NE, B, Bt, D, X, Y); + hipLaunchKernelGGL((Mass3DPA), dim3(nblocks), dim3(nthreads_per_block), 0, 0, + B, Bt, D, X, Y); hipErrchk( hipGetLastError() ); @@ -141,44 +137,37 @@ void MASS3DPA::runHipVariant(VariantID vid) { MASS3DPA_DATA_SETUP_HIP; - using launch_policy = RAJA::expt::LaunchPolicy - >; + constexpr bool async = true; + + using launch_policy = RAJA::expt::LaunchPolicy>; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), - RAJA::expt::Threads(Q1D, Q1D, 1)), + RAJA::expt::Threads(MPA_Q1D, MPA_Q1D, 1)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { MASS3DPA_0_GPU - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_1 } - ); // RAJA::expt::loop + ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { MASS3DPA_2 } @@ -188,9 +177,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_3 } @@ -200,9 +189,9 @@ void 
MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_4 } @@ -212,9 +201,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_5 } @@ -224,9 +213,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int q) { MASS3DPA_6 } @@ -236,9 +225,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_7 } @@ -248,9 +237,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_8 } @@ -260,9 +249,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { 
MASS3DPA_9 } @@ -286,12 +275,14 @@ void MASS3DPA::runHipVariant(VariantID vid) { default: { - std::cout << "\n MASS3DPA : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n MASS3DPA : Unknown Hip variant id = " << vid << std::endl; break; } } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MASS3DPA, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index 2a7781fed..49f74774b 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -1,11 +1,14 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + #include "MASS3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -15,16 +18,8 @@ namespace rajaperf { namespace apps { -//#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) -void MASS3DPA::runOpenMPVariant(VariantID vid) { +void MASS3DPA::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -43,53 +38,53 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { MASS3DPA_0_CPU - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, Q1D) { + CPU_FOREACH(dx, x, MPA_Q1D) { MASS3DPA_2 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, 
x, Q1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_3 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_4 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_5 } } - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { + CPU_FOREACH(d, y, MPA_D1D) { + CPU_FOREACH(q, x, MPA_Q1D) { MASS3DPA_6 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_7 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_8 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_9 } } @@ -104,36 +99,20 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { case RAJA_OpenMP: { //Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { //Grid is empty as the host does not need a compute grid to be specified RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), @@ -141,15 +120,15 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { MASS3DPA_0_CPU - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, 
RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_1 } ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { MASS3DPA_2 } @@ -159,9 +138,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_3 } @@ -171,9 +150,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_4 } @@ -183,9 +162,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_5 } @@ -195,21 +174,21 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int q) { MASS3DPA_6 } ); // RAJA::expt::loop - } + } ); // RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_7 } @@ -219,9 +198,9 @@ void 
MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_8 } @@ -231,9 +210,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_9 } @@ -241,8 +220,8 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { } ); // RAJA::expt::loop - } // lambda (e) - ); // RAJA::expt::loop + } // lambda (e) + ); // RAJA::expt::loop } // outer lambda (ctx) ); // // RAJA::expt::launch @@ -254,11 +233,11 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { } default: - std::cout << "\n MASS3DPA : Unknown OpenMP variant id = " << vid + getCout() << "\n MASS3DPA : Unknown OpenMP variant id = " << vid << std::endl; } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp index 6b61fa056..86021b52d 100644 --- a/src/apps/MASS3DPA-OMPTarget.cpp +++ b/src/apps/MASS3DPA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -20,14 +20,14 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runOpenMPTargetVariant(VariantID vid) { +void MASS3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); switch (vid) { default: { - std::cout << "\n MASS3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; + getCout() << "\n MASS3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; break; } } diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 027a9dacb..d276c4a57 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -1,11 +1,14 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + #include "MASS3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -15,16 +18,8 @@ namespace rajaperf { namespace apps { -//#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) -void MASS3DPA::runSeqVariant(VariantID vid) { +void MASS3DPA::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; @@ -40,53 +35,53 @@ void MASS3DPA::runSeqVariant(VariantID vid) { MASS3DPA_0_CPU - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, Q1D) { + 
CPU_FOREACH(dx, x, MPA_Q1D) { MASS3DPA_2 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_3 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_4 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_5 } } - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { + CPU_FOREACH(d, y, MPA_D1D) { + CPU_FOREACH(q, x, MPA_Q1D) { MASS3DPA_6 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_7 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_8 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_9 } } @@ -102,51 +97,35 @@ void MASS3DPA::runSeqVariant(VariantID vid) { case RAJA_Seq: { //Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; - - using outer_x = RAJA::expt::LoopPolicy; - - using inner_x = RAJA::expt::LoopPolicy; - - using inner_y = RAJA::expt::LoopPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; + + using outer_x = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { MASS3DPA_0_CPU - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { 
- RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_1 } ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { MASS3DPA_2 } @@ -156,9 +135,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_3 } @@ -168,9 +147,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_4 } @@ -180,9 +159,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_5 } @@ -192,9 +171,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int q) { MASS3DPA_6 } @@ -204,9 +183,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_7 } @@ -216,9 
+195,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_8 } @@ -228,9 +207,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_9 } @@ -252,7 +231,7 @@ void MASS3DPA::runSeqVariant(VariantID vid) { #endif // RUN_RAJA_SEQ default: - std::cout << "\n MASS3DPA : Unknown Seq variant id = " << vid << std::endl; + getCout() << "\n MASS3DPA : Unknown Seq variant id = " << vid << std::endl; } } diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 6057127dc..288e7ff82 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -25,28 +25,28 @@ MASS3DPA::MASS3DPA(const RunParams& params) { m_NE_default = 8000; - setDefaultProblemSize(m_NE_default*Q1D*Q1D*Q1D); + setDefaultProblemSize(m_NE_default*MPA_Q1D*MPA_Q1D*MPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(Q1D*Q1D*Q1D), Index_type(1)); + m_NE = std::max(getTargetProblemSize()/(MPA_Q1D*MPA_Q1D*MPA_Q1D), Index_type(1)); - setActualProblemSize( m_NE*Q1D*Q1D*Q1D ); + setActualProblemSize( m_NE*MPA_Q1D*MPA_Q1D*MPA_Q1D ); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( Q1D*D1D*sizeof(Real_type) + - Q1D*D1D*sizeof(Real_type) + - Q1D*Q1D*Q1D*m_NE*sizeof(Real_type) + - D1D*D1D*D1D*m_NE*sizeof(Real_type) + - D1D*D1D*D1D*m_NE*sizeof(Real_type) ); - - setFLOPsPerRep(m_NE * (2 * D1D * D1D * D1D * Q1D + - 2 * D1D * D1D * Q1D * Q1D + - 2 * D1D * Q1D * Q1D * Q1D + Q1D * Q1D * Q1D + - 2 * Q1D * Q1D * Q1D * D1D + - 2 * Q1D * Q1D * D1D * D1D + - 2 * Q1D * D1D * D1D * D1D + D1D * D1D * D1D)); + setBytesPerRep( MPA_Q1D*MPA_D1D*sizeof(Real_type) + + MPA_Q1D*MPA_D1D*sizeof(Real_type) + + MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE*sizeof(Real_type) + + MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) + + MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) ); + + setFLOPsPerRep(m_NE * (2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D + + 2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D + + 2 * MPA_D1D * MPA_Q1D * MPA_Q1D * MPA_Q1D + MPA_Q1D * MPA_Q1D * MPA_Q1D + + 2 * MPA_Q1D * MPA_Q1D * MPA_Q1D * MPA_D1D + + 2 * MPA_Q1D * MPA_Q1D * MPA_D1D * MPA_D1D + + 2 * MPA_Q1D * MPA_D1D * MPA_D1D * MPA_D1D + MPA_D1D * MPA_D1D * MPA_D1D)); setUsesFeature(Teams); setVariantDefined( Base_Seq ); @@ -67,22 +67,22 @@ MASS3DPA::~MASS3DPA() { } -void MASS3DPA::setUp(VariantID vid) +void MASS3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - allocAndInitDataConst(m_B, int(Q1D*D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_Bt,int(Q1D*D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_D, int(Q1D*Q1D*Q1D*m_NE), Real_type(1.0), 
vid); - allocAndInitDataConst(m_X, int(D1D*D1D*D1D*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_Y, int(D1D*D1D*D1D*m_NE), Real_type(0.0), vid); + allocAndInitDataConst(m_B, int(MPA_Q1D*MPA_D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_Bt,int(MPA_Q1D*MPA_D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, int(MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_X, int(MPA_D1D*MPA_D1D*MPA_D1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_Y, int(MPA_D1D*MPA_D1D*MPA_D1D*m_NE), Real_type(0.0), vid); } -void MASS3DPA::updateChecksum(VariantID vid) +void MASS3DPA::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_Y, D1D*D1D*D1D*m_NE); + checksum[vid][tune_idx] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE); } -void MASS3DPA::tearDown(VariantID vid) +void MASS3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 9e2255fdd..0d1c3a42d 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// Action of 3D Mass matrix via partial assembly +/// Action of 3D mass matrix via partial assembly /// /// Based on MFEM's/CEED algorithms. /// Reference implementation @@ -15,8 +15,8 @@ /// /// for (int e = 0; e < NE; ++e) { /// -/// constexpr int MQ1 = Q1D; -/// constexpr int MD1 = D1D; +/// constexpr int MQ1 = MPA_Q1D; +/// constexpr int MD1 = MPA_D1D; /// constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; /// double sDQ[MQ1 * MD1]; /// double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; @@ -30,120 +30,120 @@ /// double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; /// double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; /// -/// for(int dy=0; dy MD1) ? MQ1 : MD1; \ double sDQ[MQ1 * MD1]; \ double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ @@ -196,8 +197,8 @@ Index_type NE = m_NE; double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; #define MASS3DPA_0_GPU \ - constexpr int MQ1 = Q1D; \ - constexpr int MD1 = D1D; \ + constexpr int MQ1 = MPA_Q1D; \ + constexpr int MD1 = MPA_D1D; \ constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ RAJA_TEAM_SHARED double sDQ[MQ1 * MD1]; \ double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ @@ -212,146 +213,132 @@ Index_type NE = m_NE; double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; #define MASS3DPA_1 \ - RAJA_UNROLL(MD1) \ -for (int dz = 0; dz< D1D; ++dz) { \ + RAJAPERF_UNROLL(MD1) \ +for (int dz = 0; dz< MPA_D1D; ++dz) { \ Xsmem[dz][dy][dx] = X_(dx, dy, dz, e); \ } #define MASS3DPA_2 \ Bsmem[dx][dy] = B_(dx, dy); -// 2 * D1D * D1D * D1D * Q1D +// 2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D #define MASS3DPA_3 \ - double u[D1D]; \ -RAJA_UNROLL(MD1) \ -for (int dz = 0; dz < D1D; dz++) { \ + double u[MPA_D1D]; \ +RAJAPERF_UNROLL(MD1) \ +for (int dz = 0; dz < MPA_D1D; dz++) { \ u[dz] = 0; \ } \ -RAJA_UNROLL(MD1) \ -for (int dx = 0; dx < D1D; ++dx) { \ -RAJA_UNROLL(MD1) \ -for (int dz = 0; dz < D1D; ++dz) { \ +RAJAPERF_UNROLL(MD1) \ +for (int dx = 0; dx < MPA_D1D; ++dx) { \ +RAJAPERF_UNROLL(MD1) \ +for (int dz = 0; dz < MPA_D1D; ++dz) { \ u[dz] += Xsmem[dz][dy][dx] * Bsmem[qx][dx]; \ } \ } \ -RAJA_UNROLL(MD1) \ -for (int dz = 0; dz < D1D; ++dz) { \ +RAJAPERF_UNROLL(MD1) \ +for (int dz = 0; dz < MPA_D1D; ++dz) { \ DDQ[dz][dy][qx] = u[dz]; \ } -//2 * D1D * D1D * Q1D * Q1D +//2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D #define MASS3DPA_4 \ - double u[D1D]; \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { \ + double u[MPA_D1D]; \ + RAJAPERF_UNROLL(MD1) \ 
+ for (int dz = 0; dz < MPA_D1D; dz++) { \ u[dz] = 0; \ } \ - RAJA_UNROLL(MD1) \ - for (int dy = 0; dy < D1D; ++dy) { \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { \ + RAJAPERF_UNROLL(MD1) \ + for (int dy = 0; dy < MPA_D1D; ++dy) { \ + RAJAPERF_UNROLL(MD1) \ + for (int dz = 0; dz < MPA_D1D; dz++) { \ u[dz] += DDQ[dz][dy][qx] * Bsmem[qy][dy]; \ } \ } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { \ + RAJAPERF_UNROLL(MD1) \ + for (int dz = 0; dz < MPA_D1D; dz++) { \ DQQ[dz][qy][qx] = u[dz]; \ } -//2 * D1D * Q1D * Q1D * Q1D + Q1D * Q1D * Q1D +//2 * MPA_D1D * MPA_Q1D * MPA_Q1D * MPA_Q1D + MPA_Q1D * MPA_Q1D * MPA_Q1D #define MASS3DPA_5 \ - double u[Q1D]; \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { \ + double u[MPA_Q1D]; \ + RAJAPERF_UNROLL(MQ1) \ + for (int qz = 0; qz < MPA_Q1D; qz++) { \ u[qz] = 0; \ } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { \ + RAJAPERF_UNROLL(MD1) \ + for (int dz = 0; dz < MPA_D1D; ++dz) { \ + RAJAPERF_UNROLL(MQ1) \ + for (int qz = 0; qz < MPA_Q1D; qz++) { \ u[qz] += DQQ[dz][qy][qx] * Bsmem[qz][dz]; \ } \ } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { \ + RAJAPERF_UNROLL(MQ1) \ + for (int qz = 0; qz < MPA_Q1D; qz++) { \ QQQ[qz][qy][qx] = u[qz] * D_(qx, qy, qz, e); \ } #define MASS3DPA_6 \ Btsmem[d][q] = Bt_(q, d); -//2 * Q1D * Q1D * Q1D * D1D +//2 * MPA_Q1D * MPA_Q1D * MPA_Q1D * MPA_D1D #define MASS3DPA_7 \ - double u[Q1D]; \ -RAJA_UNROLL(MQ1) \ -for (int qz = 0; qz < Q1D; ++qz) { \ + double u[MPA_Q1D]; \ +RAJAPERF_UNROLL(MQ1) \ +for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] = 0; \ } \ -RAJA_UNROLL(MQ1) \ -for (int qx = 0; qx < Q1D; ++qx) { \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ +RAJAPERF_UNROLL(MQ1) \ +for (int qx = 0; qx < MPA_Q1D; ++qx) { \ + RAJAPERF_UNROLL(MQ1) \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] += QQQ[qz][qy][qx] * Btsmem[dx][qx]; \ } \ } \ -RAJA_UNROLL(MQ1) \ -for (int 
qz = 0; qz < Q1D; ++qz) { \ +RAJAPERF_UNROLL(MQ1) \ +for (int qz = 0; qz < MPA_Q1D; ++qz) { \ QQD[qz][qy][dx] = u[qz]; \ } -// 2 * Q1D * Q1D * D1D * D1D +// 2 * MPA_Q1D * MPA_Q1D * MPA_D1D * MPA_D1D #define MASS3DPA_8 \ - double u[Q1D]; \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ + double u[MPA_Q1D]; \ + RAJAPERF_UNROLL(MQ1) \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] = 0; \ } \ - RAJA_UNROLL(MQ1) \ - for (int qy = 0; qy < Q1D; ++qy) { \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ + RAJAPERF_UNROLL(MQ1) \ + for (int qy = 0; qy < MPA_Q1D; ++qy) { \ + RAJAPERF_UNROLL(MQ1) \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] += QQD[qz][qy][dx] * Btsmem[dy][qy]; \ } \ } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ + RAJAPERF_UNROLL(MQ1) \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ QDD[qz][dy][dx] = u[qz]; \ } -//2 * Q1D * D1D * D1D * D1D + D1D * D1D * D1D +//2 * MPA_Q1D * MPA_D1D * MPA_D1D * MPA_D1D + MPA_D1D * MPA_D1D * MPA_D1D #define MASS3DPA_9 \ - double u[D1D]; \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ + double u[MPA_D1D]; \ + RAJAPERF_UNROLL(MD1) \ + for (int dz = 0; dz < MPA_D1D; ++dz) { \ u[dz] = 0; \ } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ + RAJAPERF_UNROLL(MQ1) \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ + RAJAPERF_UNROLL(MD1) \ + for (int dz = 0; dz < MPA_D1D; ++dz) { \ u[dz] += QDD[qz][dy][dx] * Btsmem[dz][qz]; \ } \ } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ + RAJAPERF_UNROLL(MD1) \ + for (int dz = 0; dz < MPA_D1D; ++dz) { \ Y_(dx, dy, dz, e) += u[dz]; \ } -#if defined(RAJA_ENABLE_CUDA) - using m3d_device_launch = RAJA::expt::cuda_launch_t; - using m3d_gpu_block_x_policy = RAJA::cuda_block_x_direct; - using m3d_gpu_thread_x_policy = RAJA::cuda_thread_x_loop; - using m3d_gpu_thread_y_policy = RAJA::cuda_thread_y_loop; -#endif - -#if defined(RAJA_ENABLE_HIP) - using 
m3d_device_launch = RAJA::expt::hip_launch_t; - using m3d_gpu_block_x_policy = RAJA::hip_block_x_direct; - using m3d_gpu_thread_x_policy = RAJA::hip_thread_x_loop; - using m3d_gpu_thread_y_policy = RAJA::hip_thread_y_loop; -#endif - namespace rajaperf { class RunParams; @@ -367,17 +354,26 @@ class MASS3DPA : public KernelBase ~MASS3DPA(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; + using gpu_block_sizes_type = gpu_block_size::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp new file mode 100644 index 000000000..e8aadcb2b --- /dev/null +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NODAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +#define NODAL_ACCUMULATION_3D_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, m_nodal_array_length); \ + allocAndInitCudaDeviceData(vol, m_vol, m_zonal_array_length); \ + allocAndInitCudaDeviceData(real_zones, m_domain->real_zones, iend); + +#define NODAL_ACCUMULATION_3D_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_x, x, m_nodal_array_length); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(vol); \ + deallocCudaDeviceData(real_zones); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void nodal_accumulation_3d(Real_ptr vol, + Real_ptr x0, Real_ptr x1, + Real_ptr x2, Real_ptr x3, + Real_ptr x4, Real_ptr x5, + Real_ptr x6, Real_ptr x7, + Index_ptr real_zones, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ii + ibegin; + if (i < iend) { + NODAL_ACCUMULATION_3D_BODY_INDEX; + NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::cuda_atomic); + } +} + + +template < size_t block_size > +void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + NODAL_ACCUMULATION_3D_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + NODAL_ACCUMULATION_3D_DATA_SETUP_CUDA; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + + nodal_accumulation_3d<<>>(vol, + x0, x1, x2, x3, x4, x5, x6, x7, + real_zones, + ibegin, iend); + cudaErrchk( cudaGetLastError() 
); + + } + stopTimer(); + + NODAL_ACCUMULATION_3D_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + NODAL_ACCUMULATION_3D_DATA_SETUP_CUDA; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + camp::resources::Resource working_res{camp::resources::Cuda()}; + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, + working_res); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + zones, [=] __device__ (Index_type i) { + NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::cuda_atomic); + }); + + } + stopTimer(); + + NODAL_ACCUMULATION_3D_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n NODAL_ACCUMULATION_3D : Unknown Cuda variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NODAL_ACCUMULATION_3D, Cuda) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp new file mode 100644 index 000000000..09cea6211 --- /dev/null +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "NODAL_ACCUMULATION_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_HIP)
+
+#include "common/HipDataUtils.hpp"
+
+#include "AppsData.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace apps
+{
+
+#define NODAL_ACCUMULATION_3D_DATA_SETUP_HIP \
+  allocAndInitHipDeviceData(x, m_x, m_nodal_array_length); \
+  allocAndInitHipDeviceData(vol, m_vol, m_zonal_array_length); \
+  allocAndInitHipDeviceData(real_zones, m_domain->real_zones, iend);
+
+#define NODAL_ACCUMULATION_3D_DATA_TEARDOWN_HIP \
+  getHipDeviceData(m_x, x, m_nodal_array_length); \
+  deallocHipDeviceData(x); \
+  deallocHipDeviceData(vol); \
+  deallocHipDeviceData(real_zones);
+
+template < size_t block_size >
+__launch_bounds__(block_size)
+__global__ void nodal_accumulation_3d(Real_ptr vol,
+                                      Real_ptr x0, Real_ptr x1,
+                                      Real_ptr x2, Real_ptr x3,
+                                      Real_ptr x4, Real_ptr x5,
+                                      Real_ptr x6, Real_ptr x7,
+                                      Index_ptr real_zones,
+                                      Index_type ibegin, Index_type iend)
+{
+  Index_type ii = blockIdx.x * blockDim.x + threadIdx.x;
+  Index_type i = ii + ibegin;
+  if (i < iend) {
+    NODAL_ACCUMULATION_3D_BODY_INDEX;
+    NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::hip_atomic);
+  }
+}
+
+
+template < size_t block_size >
+void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = m_domain->n_real_zones;
+
+  NODAL_ACCUMULATION_3D_DATA_SETUP;
+
+  if ( vid == Base_HIP ) {
+
+    NODAL_ACCUMULATION_3D_DATA_SETUP_HIP;
+
+    NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+
+      hipLaunchKernelGGL((nodal_accumulation_3d<block_size>), dim3(grid_size), dim3(block_size), 0, 0, vol,
+                                       x0, x1, x2, x3, x4, x5, x6, x7,
+                                       real_zones,
+                                       ibegin, iend);
+      hipErrchk( hipGetLastError() );
+
+    }
+    stopTimer();
+
+    NODAL_ACCUMULATION_3D_DATA_TEARDOWN_HIP;
+
+  } else if ( vid == RAJA_HIP ) {
+
+    NODAL_ACCUMULATION_3D_DATA_SETUP_HIP;
+
+    NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+
+    camp::resources::Resource working_res{camp::resources::Hip()};
+    RAJA::TypedListSegment<Index_type> zones(m_domain->real_zones,
+                                             m_domain->n_real_zones,
+                                             working_res);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::hip_exec<block_size, true /*async*/> >(
+        zones, [=] __device__ (Index_type i) {
+          NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::hip_atomic);
+      });
+
+    }
+    stopTimer();
+
+    NODAL_ACCUMULATION_3D_DATA_TEARDOWN_HIP;
+
+  } else {
+     std::cout << "\n NODAL_ACCUMULATION_3D : Unknown Hip variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NODAL_ACCUMULATION_3D, Hip)
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_HIP
diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp
new file mode 100644
index 000000000..baaf60664
--- /dev/null
+++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp
@@ -0,0 +1,147 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "NODAL_ACCUMULATION_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include "AppsData.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace apps
+{
+
+
+void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+{
+#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = m_domain->n_real_zones;
+
+  NODAL_ACCUMULATION_3D_DATA_SETUP;
+
+  NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+
+
+  switch ( vid ) {
+
+    case Base_OpenMP : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type ii = ibegin ; ii < iend ; ++ii ) {
+          NODAL_ACCUMULATION_3D_BODY_INDEX;
+
+          Real_type val = 0.125 * vol[i];
+
+          #pragma omp atomic
+          x0[i] += val;
+          #pragma omp atomic
+          x1[i] += val;
+          #pragma omp atomic
+          x2[i] += val;
+          #pragma omp atomic
+          x3[i] += val;
+          #pragma omp atomic
+          x4[i] += val;
+          #pragma omp atomic
+          x5[i] += val;
+          #pragma omp atomic
+          x6[i] += val;
+          #pragma omp atomic
+          x7[i] += val;
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case Lambda_OpenMP : {
+
+      auto nodal_accumulation_3d_lam = [=](Index_type ii) {
+          NODAL_ACCUMULATION_3D_BODY_INDEX;
+
+          Real_type val = 0.125 * vol[i];
+
+          #pragma omp atomic
+          x0[i] += val;
+          #pragma omp atomic
+          x1[i] += val;
+          #pragma omp atomic
+          x2[i] += val;
+          #pragma omp atomic
+          x3[i] += val;
+          #pragma omp atomic
+          x4[i] += val;
+          #pragma omp atomic
+          x5[i] += val;
+          #pragma omp atomic
+          x6[i] += val;
+          #pragma omp atomic
+          x7[i] += val;
+      };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type ii = ibegin ; ii < iend ; ++ii ) {
+          nodal_accumulation_3d_lam(ii);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_OpenMP : {
+
+      camp::resources::Resource working_res{camp::resources::Host()};
+      RAJA::TypedListSegment<Index_type> zones(m_domain->real_zones,
+                                               m_domain->n_real_zones,
+                                               working_res);
+
+      auto nodal_accumulation_3d_lam = [=](Index_type i) {
+            NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::omp_atomic);
+      };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::forall<RAJA::omp_parallel_for_exec>(
+          zones, nodal_accumulation_3d_lam);
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    default : {
+      std::cout << "\n NODAL_ACCUMULATION_3D : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+#else
+  RAJA_UNUSED_VAR(vid);
+#endif
+}
+
+} // end namespace apps
+} // end namespace rajaperf
diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp
new file mode 100644
index 000000000..f19189c64
--- /dev/null
+++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp
@@ -0,0 +1,126 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "NODAL_ACCUMULATION_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_TARGET_OPENMP)
+
+#include "common/OpenMPTargetDataUtils.hpp"
+
+#include "AppsData.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace apps
+{
+
+  //
+  // Define threads per team for target execution
+  //
+  const size_t threads_per_team = 256;
+
+#define NODAL_ACCUMULATION_3D_DATA_SETUP_OMP_TARGET \
+  int hid = omp_get_initial_device(); \
+  int did = omp_get_default_device(); \
+\
+  allocAndInitOpenMPDeviceData(x, m_x, m_nodal_array_length, did, hid); \
+  allocAndInitOpenMPDeviceData(vol, m_vol, m_zonal_array_length, did, hid); \
+  allocAndInitOpenMPDeviceData(real_zones, m_domain->real_zones, iend, did, hid);
+
+#define NODAL_ACCUMULATION_3D_DATA_TEARDOWN_OMP_TARGET \
+  getOpenMPDeviceData(m_x, x, m_nodal_array_length, hid, did); \
+  deallocOpenMPDeviceData(x, did); \
+  deallocOpenMPDeviceData(vol, did); \
+  deallocOpenMPDeviceData(real_zones, did);
+
+
+void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = m_domain->n_real_zones;
+
+  NODAL_ACCUMULATION_3D_DATA_SETUP;
+
+  if ( vid == Base_OpenMPTarget ) {
+
+    NODAL_ACCUMULATION_3D_DATA_SETUP_OMP_TARGET;
+
+    NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      #pragma omp target is_device_ptr(x0,x1,x2,x3,x4,x5,x6,x7, \
+                                       vol, real_zones) device( did )
+      #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1)
+      for (Index_type ii = ibegin ; ii < iend ; ++ii ) {
+        NODAL_ACCUMULATION_3D_BODY_INDEX;
+
+        Real_type val = 0.125 * vol[i];
+
+        #pragma omp atomic
+        x0[i] += val;
+        #pragma omp atomic
+        x1[i] += val;
+        #pragma omp atomic
+        x2[i] += val;
+        #pragma omp atomic
+        x3[i] += val;
+        #pragma omp atomic
+        x4[i] += val;
+        #pragma omp atomic
+        x5[i] += val;
+        #pragma omp atomic
+        x6[i] += val;
+        #pragma omp atomic
+        x7[i] += val;
+      }
+
+    }
+    stopTimer();
+
+    NODAL_ACCUMULATION_3D_DATA_TEARDOWN_OMP_TARGET;
+
+  } else if ( vid == RAJA_OpenMPTarget ) {
+
+    NODAL_ACCUMULATION_3D_DATA_SETUP_OMP_TARGET;
+
+    NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+
+    camp::resources::Resource working_res{camp::resources::Omp()};
+    RAJA::TypedListSegment<Index_type> zones(m_domain->real_zones,
+                                             m_domain->n_real_zones,
+                                             working_res);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+        zones, [=](Index_type i) {
+          NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::omp_atomic);
+      });
+
+    }
+    stopTimer();
+
+    NODAL_ACCUMULATION_3D_DATA_TEARDOWN_OMP_TARGET;
+
+  } else {
+     std::cout << "\n NODAL_ACCUMULATION_3D : Unknown OMP Target variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_TARGET_OPENMP
diff --git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp
new file mode 100644
index 000000000..61449d0f6
--- /dev/null
+++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp
@@ -0,0 +1,104 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "NODAL_ACCUMULATION_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include "AppsData.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace apps
+{
+
+
+void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = m_domain->n_real_zones;
+
+  NODAL_ACCUMULATION_3D_DATA_SETUP;
+
+  NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type ii = ibegin ; ii < iend ; ++ii ) {
+          NODAL_ACCUMULATION_3D_BODY_INDEX;
+          NODAL_ACCUMULATION_3D_BODY;
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto nodal_accumulation_3d_lam = [=](Index_type ii) {
+            NODAL_ACCUMULATION_3D_BODY_INDEX;
+            NODAL_ACCUMULATION_3D_BODY;
+      };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type ii = ibegin ; ii < iend ; ++ii ) {
+          nodal_accumulation_3d_lam(ii);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      camp::resources::Resource working_res{camp::resources::Host()};
+      RAJA::TypedListSegment<Index_type> zones(m_domain->real_zones,
+                                               m_domain->n_real_zones,
+                                               working_res);
+
+      auto nodal_accumulation_3d_lam = [=](Index_type i) {
+            NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::seq_atomic);
+      };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::forall<RAJA::loop_exec>(zones, nodal_accumulation_3d_lam);
+
+      }
+      stopTimer();
+
+      break;
+    }
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\n NODAL_ACCUMULATION_3D : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace apps
+} // end namespace rajaperf
diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp
b/src/apps/NODAL_ACCUMULATION_3D.cpp
new file mode 100644
index 000000000..5fd512fb7
--- /dev/null
+++ b/src/apps/NODAL_ACCUMULATION_3D.cpp
@@ -0,0 +1,97 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "NODAL_ACCUMULATION_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include "AppsData.hpp"
+#include "common/DataUtils.hpp"
+
+#include <cmath>
+
+
+namespace rajaperf
+{
+namespace apps
+{
+
+
+NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params)
+  : KernelBase(rajaperf::Apps_NODAL_ACCUMULATION_3D, params)
+{
+  setDefaultProblemSize(100*100*100);  // See rzmax in ADomain struct
+  setDefaultReps(100);
+
+  Index_type rzmax = std::cbrt(getTargetProblemSize())+1;
+  m_domain = new ADomain(rzmax, /* ndims = */ 3);
+
+  m_nodal_array_length = m_domain->nnalls;
+  m_zonal_array_length = m_domain->lpz+1;
+
+  setActualProblemSize( m_domain->n_real_zones );
+
+  setItsPerRep( getActualProblemSize() );
+  setKernelsPerRep(1);
+  // touched data size, not actual number of stores and loads
+  setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() +
+                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() +
+                  (1*sizeof(Real_type) + 1*sizeof(Real_type)) * (m_domain->imax+1 - m_domain->imin)*(m_domain->jmax+1 - m_domain->jmin)*(m_domain->kmax+1 - m_domain->kmin));
+  setFLOPsPerRep(9 * getItsPerRep());
+
+  checksum_scale_factor = 0.001 *
+              ( static_cast<Checksum_type>(getDefaultProblemSize()) /
+                getActualProblemSize() );
+
+  setUsesFeature(Forall);
+  setUsesFeature(Atomic);
+
+  setVariantDefined( Base_Seq );
+  setVariantDefined( Lambda_Seq );
+  setVariantDefined( RAJA_Seq );
+
+  setVariantDefined( Base_OpenMP );
+  setVariantDefined( Lambda_OpenMP );
+  setVariantDefined( RAJA_OpenMP );
+
+  setVariantDefined( Base_OpenMPTarget );
+  setVariantDefined( RAJA_OpenMPTarget );
+
+  setVariantDefined( Base_CUDA );
+  setVariantDefined( RAJA_CUDA );
+
+  setVariantDefined( Base_HIP );
+  setVariantDefined( RAJA_HIP );
+}
+
+NODAL_ACCUMULATION_3D::~NODAL_ACCUMULATION_3D()
+{
+  delete m_domain;
+}
+
+void NODAL_ACCUMULATION_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+{
+  allocAndInitDataConst(m_x, m_nodal_array_length, 0.0, vid);
+  allocAndInitDataConst(m_vol, m_zonal_array_length, 1.0, vid);
+}
+
+void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid, size_t tune_idx)
+{
+  checksum[vid].at(tune_idx) += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor );
+}
+
+void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+{
+  (void) vid;
+
+  deallocData(m_x);
+  deallocData(m_vol);
+}
+
+} // end namespace apps
+} // end namespace rajaperf
diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp
new file mode 100644
index 000000000..a574f331a
--- /dev/null
+++ b/src/apps/NODAL_ACCUMULATION_3D.hpp
@@ -0,0 +1,121 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// NODAL_ACCUMULATION_3D kernel reference implementation:
+///
+/// NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+///
+/// for (Index_type ii = ibegin; ii < iend; ++ii ) {
+///   Index_type i = real_zones[ii];
+///
+///   Real_type val = 0.125 * vol[i] ;
+///
+///   x0[i] += val;
+///   x1[i] += val;
+///   x2[i] += val;
+///   x3[i] += val;
+///   x4[i] += val;
+///   x5[i] += val;
+///   x6[i] += val;
+///   x7[i] += val;
+///
+/// }
+///
+
+#ifndef RAJAPerf_Apps_NODAL_ACCUMULATION_3D_HPP
+#define RAJAPerf_Apps_NODAL_ACCUMULATION_3D_HPP
+
+#define NODAL_ACCUMULATION_3D_DATA_SETUP \
+  Real_ptr x = m_x; \
+  Real_ptr vol = m_vol; \
+  \
+  Real_ptr x0,x1,x2,x3,x4,x5,x6,x7; \
+  \
+  Index_ptr real_zones = m_domain->real_zones;
+
+#define NODAL_ACCUMULATION_3D_BODY_INDEX \
+  Index_type i = real_zones[ii];
+
+#define NODAL_ACCUMULATION_3D_BODY \
+  Real_type val = 0.125 * vol[i]; \
+  \
+  x0[i] += val; \
+  x1[i] += val; \
+  x2[i] += val; \
+  x3[i] += val; \
+  x4[i] += val; \
+  x5[i] += val; \
+  x6[i] += val; \
+  x7[i] += val;
+
+#define NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(policy) \
+  Real_type val = 0.125 * vol[i]; \
+  \
+  RAJA::atomicAdd<policy>(&x0[i], val); \
+  RAJA::atomicAdd<policy>(&x1[i], val); \
+  RAJA::atomicAdd<policy>(&x2[i], val); \
+  RAJA::atomicAdd<policy>(&x3[i], val); \
+  RAJA::atomicAdd<policy>(&x4[i], val); \
+  RAJA::atomicAdd<policy>(&x5[i], val); \
+  RAJA::atomicAdd<policy>(&x6[i], val); \
+  RAJA::atomicAdd<policy>(&x7[i], val);
+
+
+
+#include "common/KernelBase.hpp"
+
+namespace rajaperf
+{
+class RunParams;
+
+namespace apps
+{
+class ADomain;
+
+class NODAL_ACCUMULATION_3D : public KernelBase
+{
+public:
+
+  NODAL_ACCUMULATION_3D(const RunParams& params);
+
+  ~NODAL_ACCUMULATION_3D();
+
+  void setUp(VariantID vid, size_t tune_idx);
+  void updateChecksum(VariantID vid, size_t tune_idx);
+  void tearDown(VariantID vid, size_t tune_idx);
+
+  void runSeqVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPVariant(VariantID vid, size_t tune_idx);
+  void runCudaVariant(VariantID vid, size_t tune_idx);
+  void runHipVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
+  template < size_t block_size >
+  void runCudaVariantImpl(VariantID vid);
+  template < size_t block_size >
+  void runHipVariantImpl(VariantID vid);
+
+private:
+  static const size_t default_gpu_block_size = 256;
+  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+
+  Real_ptr m_x;
+  Real_ptr m_vol;
+
+  ADomain* m_domain;
+  Index_type m_nodal_array_length;
+  Index_type m_zonal_array_length;
+};
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif // closing endif for header file include guard
diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp
index b0d5ab615..14ad2ae34 100644
--- a/src/apps/PRESSURE-Cuda.cpp
+++ b/src/apps/PRESSURE-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
// @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define PRESSURE_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(compression, m_compression, iend); \ allocAndInitCudaDeviceData(bvc, m_bvc, iend); \ @@ -42,30 +36,35 @@ namespace apps deallocCudaDeviceData(e_old); \ deallocCudaDeviceData(vnewc); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pressurecalc1(Real_ptr bvc, Real_ptr compression, const Real_type cls, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PRESSURE_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pressurecalc2(Real_ptr p_new, Real_ptr bvc, Real_ptr e_old, Real_ptr vnewc, const Real_type p_cut, const Real_type eosvmax, const Real_type pmin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PRESSURE_BODY2; } } -void PRESSURE::runCudaVariant(VariantID vid) +template < size_t block_size > +void PRESSURE::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -82,12 +81,12 @@ void PRESSURE::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - pressurecalc1<<>>( bvc, compression, + pressurecalc1<<>>( bvc, compression, cls, iend ); cudaErrchk( cudaGetLastError() ); - pressurecalc2<<>>( p_new, bvc, e_old, + pressurecalc2<<>>( p_new, bvc, e_old, vnewc, p_cut, eosvmax, pmin, iend ); @@ -133,10 +132,12 @@ void PRESSURE::runCudaVariant(VariantID vid) PRESSURE_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n PRESSURE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown Cuda variant id = " << vid << std::endl; } } 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PRESSURE, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 646fbc703..03c9e04fb 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define PRESSURE_DATA_SETUP_HIP \ allocAndInitHipDeviceData(compression, m_compression, iend); \ allocAndInitHipDeviceData(bvc, m_bvc, iend); \ @@ -42,30 +36,35 @@ namespace apps deallocHipDeviceData(e_old); \ deallocHipDeviceData(vnewc); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pressurecalc1(Real_ptr bvc, Real_ptr compression, const Real_type cls, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PRESSURE_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pressurecalc2(Real_ptr p_new, Real_ptr bvc, Real_ptr e_old, Real_ptr vnewc, const Real_type p_cut, const Real_type eosvmax, const Real_type pmin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PRESSURE_BODY2; } } -void PRESSURE::runHipVariant(VariantID vid) +template < size_t block_size > +void PRESSURE::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -82,12 +81,12 @@ void PRESSURE::runHipVariant(VariantID vid) const size_t 
grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((pressurecalc1), dim3(grid_size), dim3(block_size), 0, 0, bvc, compression, + hipLaunchKernelGGL((pressurecalc1), dim3(grid_size), dim3(block_size), 0, 0, bvc, compression, cls, iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((pressurecalc2), dim3(grid_size), dim3(block_size), 0, 0, p_new, bvc, e_old, + hipLaunchKernelGGL((pressurecalc2), dim3(grid_size), dim3(block_size), 0, 0, p_new, bvc, e_old, vnewc, p_cut, eosvmax, pmin, iend ); @@ -126,10 +125,12 @@ void PRESSURE::runHipVariant(VariantID vid) PRESSURE_DATA_TEARDOWN_HIP; } else { - std::cout << "\n PRESSURE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PRESSURE, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE-OMP.cpp b/src/apps/PRESSURE-OMP.cpp index 0d7182dfd..867e72586 100644 --- a/src/apps/PRESSURE-OMP.cpp +++ b/src/apps/PRESSURE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void PRESSURE::runOpenMPVariant(VariantID vid) +void PRESSURE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -34,7 +34,7 @@ void PRESSURE::runOpenMPVariant(VariantID vid) auto pressure_lam2 = [=](Index_type i) { PRESSURE_BODY2; }; - + switch ( vid ) { case Base_OpenMP : { @@ -111,12 +111,12 @@ void PRESSURE::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n PRESSURE : Unknown variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp index 643302b46..8c25f44c7 100644 --- a/src/apps/PRESSURE-OMPTarget.cpp +++ b/src/apps/PRESSURE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -45,14 +45,14 @@ namespace apps deallocOpenMPDeviceData(vnewc, did); -void PRESSURE::runOpenMPTargetVariant(VariantID vid) +void PRESSURE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); - + PRESSURE_DATA_SETUP; - + if ( vid == Base_OpenMPTarget ) { PRESSURE_DATA_SETUP_OMP_TARGET; @@ -61,13 +61,13 @@ void PRESSURE::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(compression, bvc) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { PRESSURE_BODY1; } #pragma omp target is_device_ptr(bvc, p_new, e_old, vnewc) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { PRESSURE_BODY2; } @@ -104,7 +104,7 @@ void PRESSURE::runOpenMPTargetVariant(VariantID vid) PRESSURE_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n PRESSURE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp index 77ce1200e..c2f79e977 100644 --- a/src/apps/PRESSURE-Seq.cpp +++ b/src/apps/PRESSURE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA 
Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void PRESSURE::runSeqVariant(VariantID vid) +void PRESSURE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -32,7 +32,7 @@ void PRESSURE::runSeqVariant(VariantID vid) auto pressure_lam2 = [=](Index_type i) { PRESSURE_BODY2; }; - + switch ( vid ) { case Base_Seq : { @@ -52,7 +52,7 @@ void PRESSURE::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -90,14 +90,14 @@ void PRESSURE::runSeqVariant(VariantID vid) }); // end sequential region (for single-source code) } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n PRESSURE : Unknown variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index b4ef1d72c..df2cb744f 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -58,7 +58,7 @@ PRESSURE::~PRESSURE() { } -void PRESSURE::setUp(VariantID vid) +void PRESSURE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_compression, getActualProblemSize(), vid); allocAndInitData(m_bvc, getActualProblemSize(), vid); @@ -72,12 +72,12 @@ void PRESSURE::setUp(VariantID vid) initData(m_eosvmax); } -void PRESSURE::updateChecksum(VariantID vid) +void PRESSURE::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_p_new, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_p_new, getActualProblemSize()); } -void PRESSURE::tearDown(VariantID vid) +void PRESSURE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 44c6602fa..6421ce6b0 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -63,17 +63,27 @@ class PRESSURE : public KernelBase ~PRESSURE(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_compression; Real_ptr m_bvc; Real_ptr m_p_new; diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 79db31282..3f65c1b8a 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -23,12 +23,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define VOL3D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_array_length); \ allocAndInitCudaDeviceData(y, m_y, m_array_length); \ @@ -42,6 +36,8 @@ namespace apps deallocCudaDeviceData(z); \ deallocCudaDeviceData(vol); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void vol3d(Real_ptr vol, const Real_ptr x0, const Real_ptr x1, const Real_ptr x2, const Real_ptr x3, @@ -58,7 +54,7 @@ __global__ void vol3d(Real_ptr vol, const Real_type vnormq, Index_type ibegin, Index_type iend) { - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ii = blockIdx.x * block_size + threadIdx.x; Index_type i = ii + ibegin; if (i < iend) { VOL3D_BODY; @@ -66,7 +62,8 @@ __global__ void vol3d(Real_ptr vol, } -void VOL3D::runCudaVariant(VariantID vid) +template < size_t block_size > +void VOL3D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; @@ -87,7 +84,7 @@ void VOL3D::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - vol3d<<<grid_size, block_size>>>(vol, + vol3d<block_size><<<grid_size, block_size>>>(vol, x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, z0, z1, z2, z3, z4, z5, z6, z7, @@ -122,10 +119,12 @@ void VOL3D::runCudaVariant(VariantID vid) VOL3D_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n VOL3D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(VOL3D, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 978c794ce..70f121e09 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// 
Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -23,12 +23,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define VOL3D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_array_length); \ allocAndInitHipDeviceData(y, m_y, m_array_length); \ @@ -42,6 +36,8 @@ namespace apps deallocHipDeviceData(z); \ deallocHipDeviceData(vol); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void vol3d(Real_ptr vol, const Real_ptr x0, const Real_ptr x1, const Real_ptr x2, const Real_ptr x3, @@ -58,7 +54,7 @@ __global__ void vol3d(Real_ptr vol, const Real_type vnormq, Index_type ibegin, Index_type iend) { - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ii = blockIdx.x * block_size + threadIdx.x; Index_type i = ii + ibegin; if (i < iend) { VOL3D_BODY; @@ -66,7 +62,8 @@ __global__ void vol3d(Real_ptr vol, } -void VOL3D::runHipVariant(VariantID vid) +template < size_t block_size > +void VOL3D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; @@ -87,7 +84,7 @@ void VOL3D::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((vol3d), dim3(grid_size), dim3(block_size), 0, 0, vol, + hipLaunchKernelGGL((vol3d), dim3(grid_size), dim3(block_size), 0, 0, vol, x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, z0, z1, z2, z3, z4, z5, z6, z7, @@ -122,10 +119,12 @@ void VOL3D::runHipVariant(VariantID vid) VOL3D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n VOL3D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown Hip variant id = " << vid << std::endl; } } 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(VOL3D, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D-OMP.cpp b/src/apps/VOL3D-OMP.cpp index 90b84f857..0f773876c 100644 --- a/src/apps/VOL3D-OMP.cpp +++ b/src/apps/VOL3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -14,13 +14,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void VOL3D::runOpenMPVariant(VariantID vid) +void VOL3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -45,7 +45,7 @@ void VOL3D::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp parallel for + #pragma omp parallel for for (Index_type i = ibegin ; i < iend ; ++i ) { VOL3D_BODY; } @@ -87,12 +87,12 @@ void VOL3D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n VOL3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp index 6a8de52c8..75d8fb2b0 100644 --- a/src/apps/VOL3D-OMPTarget.cpp +++ b/src/apps/VOL3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,7 +18,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -45,7 +45,7 @@ namespace apps deallocOpenMPDeviceData(vol, did); -void VOL3D::runOpenMPTargetVariant(VariantID vid) +void VOL3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; @@ -68,7 +68,7 @@ void VOL3D::runOpenMPTargetVariant(VariantID vid) y0,y1,y2,y3,y4,y5,y6,y7, \ z0,z1,z2,z3,z4,z5,z6,z7, \ vol) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin ; i < iend ; ++i ) { VOL3D_BODY; } @@ -76,7 +76,7 @@ void VOL3D::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - VOL3D_DATA_TEARDOWN_OMP_TARGET; + VOL3D_DATA_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { @@ -98,10 +98,10 @@ void VOL3D::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - VOL3D_DATA_TEARDOWN_OMP_TARGET; + VOL3D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n VOL3D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp index e748c3ffd..bb4227280 100644 --- a/src/apps/VOL3D-Seq.cpp +++ b/src/apps/VOL3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -14,13 +14,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { -void VOL3D::runSeqVariant(VariantID vid) +void VOL3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; @@ -51,7 +51,7 @@ void VOL3D::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -78,14 +78,14 @@ void VOL3D::runSeqVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), vol3d_lam); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n VOL3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index a8ac3bbc6..fd2ebb5aa 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -71,7 +71,7 @@ VOL3D::~VOL3D() delete m_domain; } -void VOL3D::setUp(VariantID vid) +void VOL3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitDataConst(m_y, m_array_length, 0.0, vid); @@ -87,12 +87,12 @@ void VOL3D::setUp(VariantID vid) m_vnormq = 0.083333333333333333; /* vnormq = 1/12 */ } -void VOL3D::updateChecksum(VariantID vid) +void VOL3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); } -void VOL3D::tearDown(VariantID vid) +void VOL3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index 6faf02523..9ddedbd19 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -160,17 +160,27 @@ class VOL3D : public KernelBase ~VOL3D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_z; diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index 51bb4fa2f..0f25f5ee0 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -58,7 +58,7 @@ COUPLE::~COUPLE() delete m_domain; } -void COUPLE::setUp(VariantID vid) +void COUPLE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { Index_type max_loop_index = m_domain->lrn; @@ -80,8 +80,9 @@ void COUPLE::setUp(VariantID vid) m_ireal = Complex_type(0.0, 1.0); } -void COUPLE::runKernel(VariantID vid) +void COUPLE::runKernel(VariantID vid, size_t tune_idx) { + RAJA_UNUSED_VAR(tune_idx); const Index_type run_reps = getRunReps(); COUPLE_DATA_SETUP; @@ -158,7 +159,7 @@ void COUPLE::runKernel(VariantID vid) case Base_OpenMPTarget : case RAJA_OpenMPTarget : { - runOpenMPTargetVariant(vid); + runOpenMPTargetVariant(vid, tune_idx); break; } #endif @@ -167,28 +168,28 @@ void COUPLE::runKernel(VariantID vid) case Base_CUDA : case RAJA_CUDA : { - runCudaVariant(vid); + runCudaVariant(vid, tune_idx); break; } #endif default : { - std::cout << "\n COUPLE : Unknown variant id = " << vid << std::endl; + getCout() << "\n COUPLE : Unknown variant id = " << vid << std::endl; } } } -void COUPLE::updateChecksum(VariantID vid) +void COUPLE::updateChecksum(VariantID vid, size_t tune_idx) { Index_type max_loop_index = m_domain->lrn; - checksum[vid] += calcChecksum(m_t0, max_loop_index); - checksum[vid] += calcChecksum(m_t1, max_loop_index); - checksum[vid] += calcChecksum(m_t2, max_loop_index); + checksum[vid][tune_idx] += calcChecksum(m_t0, max_loop_index); + checksum[vid][tune_idx] += calcChecksum(m_t1, max_loop_index); + checksum[vid][tune_idx] += calcChecksum(m_t2, max_loop_index); } -void COUPLE::tearDown(VariantID vid) +void COUPLE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index e5040ea57..cdafcd5eb 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 
2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -161,16 +161,16 @@ class COUPLE : public KernelBase ~COUPLE(); - void setUp(VariantID vid); - void runKernel(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void runKernel(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid) {(void) vid;} - void runOpenMPVariant(VariantID vid) {(void) vid;} - void runCudaVariant(VariantID vid) {(void) vid;} - void runHipVariant(VariantID vid) {(void) vid;} - void runOpenMPTargetVariant(VariantID vid) {(void) vid;} + void runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} + void runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} + void runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} + void runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} + void runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} private: Complex_ptr m_t0; diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 250529814..ceeb1a502 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# @@ -14,12 +14,30 @@ blt_add_library( DAXPY-Cuda.cpp DAXPY-OMP.cpp DAXPY-OMPTarget.cpp + DAXPY_ATOMIC.cpp + DAXPY_ATOMIC-Seq.cpp + DAXPY_ATOMIC-Hip.cpp + DAXPY_ATOMIC-Cuda.cpp + DAXPY_ATOMIC-OMP.cpp + DAXPY_ATOMIC-OMPTarget.cpp IF_QUAD.cpp IF_QUAD-Seq.cpp IF_QUAD-Hip.cpp IF_QUAD-Cuda.cpp IF_QUAD-OMP.cpp IF_QUAD-OMPTarget.cpp + INDEXLIST.cpp + INDEXLIST-Seq.cpp + INDEXLIST-Hip.cpp + INDEXLIST-Cuda.cpp + INDEXLIST-OMP.cpp + INDEXLIST-OMPTarget.cpp + INDEXLIST_3LOOP.cpp + INDEXLIST_3LOOP-Seq.cpp + INDEXLIST_3LOOP-Hip.cpp + INDEXLIST_3LOOP-Cuda.cpp + INDEXLIST_3LOOP-OMP.cpp + INDEXLIST_3LOOP-OMPTarget.cpp INIT3.cpp INIT3-Seq.cpp INIT3-Hip.cpp @@ -74,6 +92,12 @@ blt_add_library( REDUCE3_INT-Cuda.cpp REDUCE3_INT-OMP.cpp REDUCE3_INT-OMPTarget.cpp + REDUCE_STRUCT.cpp + REDUCE_STRUCT-Seq.cpp + REDUCE_STRUCT-Hip.cpp + REDUCE_STRUCT-Cuda.cpp + REDUCE_STRUCT-OMP.cpp + REDUCE_STRUCT-OMPTarget.cpp TRAP_INT.cpp TRAP_INT-Seq.cpp TRAP_INT-Hip.cpp diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 7e4f52fed..a87421c4f 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define DAXPY_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend); \ allocAndInitCudaDeviceData(y, m_y, iend); @@ -36,17 +30,21 @@ namespace basic deallocCudaDeviceData(x); \ deallocCudaDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void daxpy(Real_ptr y, Real_ptr x, Real_type a, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { DAXPY_BODY; } } -void DAXPY::runCudaVariant(VariantID vid) + +template < size_t block_size > +void DAXPY::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,7 +60,7 @@ void DAXPY::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - daxpy<<<grid_size, block_size>>>( y, x, a, + daxpy<block_size><<<grid_size, block_size>>>( y, x, a, iend ); cudaErrchk( cudaGetLastError() ); @@ -79,7 +77,7 @@ void DAXPY::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<<grid_size, block_size>>>( + lambda_cuda_forall<block_size><<<grid_size, block_size>>>( ibegin, iend, [=] __device__ (Index_type i) { DAXPY_BODY; }); @@ -108,10 +106,12 @@ void DAXPY::runCudaVariant(VariantID vid) DAXPY_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n DAXPY : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DAXPY : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 1ed22ef76..25810c19e 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define DAXPY_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend); \ allocAndInitHipDeviceData(y, m_y, iend); @@ -36,18 +30,22 @@ namespace basic deallocHipDeviceData(x); \ deallocHipDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void daxpy(Real_ptr y, Real_ptr x, Real_type a, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { DAXPY_BODY; } } -void DAXPY::runHipVariant(VariantID vid) + +template < size_t block_size > +void DAXPY::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -63,7 +61,7 @@ void DAXPY::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, + hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, iend ); hipErrchk( hipGetLastError() ); @@ -84,7 +82,7 @@ void DAXPY::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, daxpy_lambda); hipErrchk( hipGetLastError() ); @@ -111,10 +109,12 @@ void DAXPY::runHipVariant(VariantID vid) DAXPY_DATA_TEARDOWN_HIP; } else { - std::cout << "\n DAXPY : Unknown Hip variant id = " << vid << 
std::endl; + getCout() << "\n DAXPY : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY-OMP.cpp b/src/basic/DAXPY-OMP.cpp index 5a06f5b46..a57e1709d 100644 --- a/src/basic/DAXPY-OMP.cpp +++ b/src/basic/DAXPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void DAXPY::runOpenMPVariant(VariantID vid) +void DAXPY::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void DAXPY::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; + getCout() << "\n DAXPY : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp index 286003a5d..a3862d80a 100644 --- a/src/basic/DAXPY-OMPTarget.cpp +++ b/src/basic/DAXPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -39,7 +39,7 @@ namespace basic deallocOpenMPDeviceData(y, did); -void DAXPY::runOpenMPTargetVariant(VariantID vid) +void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -81,9 +81,9 @@ void DAXPY::runOpenMPTargetVariant(VariantID vid) stopTimer(); DAXPY_DATA_TEARDOWN_OMP_TARGET; - + } else { - std::cout << "\n DAXPY : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n DAXPY : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 325297cd5..3a262561f 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void DAXPY::runSeqVariant(VariantID vid) +void DAXPY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -79,7 +79,7 @@ void DAXPY::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; + getCout() << "\n DAXPY : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 16782df2a..6d6133eb6 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -57,19 +57,19 @@ DAXPY::~DAXPY() { } -void DAXPY::setUp(VariantID vid) +void DAXPY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); allocAndInitData(m_x, getActualProblemSize(), vid); initData(m_a); } -void DAXPY::updateChecksum(VariantID vid) +void DAXPY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_y, getActualProblemSize()); + checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize()); } -void DAXPY::tearDown(VariantID vid) +void DAXPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 9f0688d8a..db8501e9f 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence 
Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -43,17 +43,27 @@ class DAXPY : public KernelBase ~DAXPY(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_x; Real_ptr m_y; Real_type m_a; diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp new file mode 100644 index 000000000..1e8210bd2 --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -0,0 +1,118 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define DAXPY_ATOMIC_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(y, m_y, iend); + +#define DAXPY_ATOMIC_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_y, y, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void daxpy_atomic(Real_ptr y, Real_ptr x, + Real_type a, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); + } +} + + +template < size_t block_size > +void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + DAXPY_ATOMIC_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + daxpy_atomic<<>>( y, x, a, + iend ); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_CUDA; + + } else if ( vid == Lambda_CUDA ) { + + DAXPY_ATOMIC_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); + }); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + DAXPY_ATOMIC_DATA_SETUP_CUDA; + + startTimer(); + for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); + }); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_CUDA; + + } else { + getCout() << "\n DAXPY_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY_ATOMIC, Cuda) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp new file mode 100644 index 000000000..a1e7a6465 --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -0,0 +1,120 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define DAXPY_ATOMIC_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(y, m_y, iend); + +#define DAXPY_ATOMIC_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_y, y, iend); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(y); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void daxpy_atomic(Real_ptr y, Real_ptr x, + Real_type a, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::hip_atomic); + } +} + + +template < size_t block_size > +void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 
0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + if ( vid == Base_HIP ) { + + DAXPY_ATOMIC_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((daxpy_atomic),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, + iend ); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_HIP; + + } else if ( vid == Lambda_HIP ) { + + DAXPY_ATOMIC_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + auto daxpy_atomic_lambda = [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::hip_atomic); + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((lambda_hip_forall), + grid_size, block_size, 0, 0, ibegin, iend, daxpy_atomic_lambda); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + DAXPY_ATOMIC_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::hip_atomic); + }); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n DAXPY_ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY_ATOMIC, Hip) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/DAXPY_ATOMIC-OMP.cpp b/src/basic/DAXPY_ATOMIC-OMP.cpp new file mode 100644 index 000000000..b28330d7e --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-OMP.cpp @@ -0,0 +1,99 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite 
project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void DAXPY_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + y[i] += a * x[i] ; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto daxpy_atomic_lam = [=](Index_type i) { + #pragma omp atomic + y[i] += a * x[i] ; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + daxpy_atomic_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::omp_atomic); + }); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n DAXPY_ATOMIC : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp new file mode 100644 index 000000000..7b19b0cf7 --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp @@ -0,0 +1,94 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define DAXPY_ATOMIC_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); + +#define DAXPY_ATOMIC_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_y, y, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); + + +void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + DAXPY_ATOMIC_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x, y) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + y[i] += a * x[i] ; + } + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + DAXPY_ATOMIC_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + 
DAXPY_ATOMIC_RAJA_BODY(RAJA::omp_atomic); + }); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n DAXPY_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp new file mode 100644 index 000000000..8eabef6cd --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-Seq.cpp @@ -0,0 +1,93 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void DAXPY_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + DAXPY_ATOMIC_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto daxpy_atomic_lam = [=](Index_type i) { + DAXPY_ATOMIC_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + daxpy_atomic_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + 
DAXPY_ATOMIC_RAJA_BODY(RAJA::seq_atomic); + }); + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << "\n DAXPY_ATOMIC : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp new file mode 100644 index 000000000..1e5d4e00e --- /dev/null +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -0,0 +1,80 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + + +DAXPY_ATOMIC::DAXPY_ATOMIC(const RunParams& params) + : KernelBase(rajaperf::Basic_DAXPY_ATOMIC, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(500); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setFLOPsPerRep(2 * getActualProblemSize()); + + setUsesFeature(Forall); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( Lambda_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( Lambda_HIP ); + setVariantDefined( RAJA_HIP ); +} + +DAXPY_ATOMIC::~DAXPY_ATOMIC() +{ +} + +void DAXPY_ATOMIC::setUp(VariantID 
vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); + allocAndInitData(m_x, getActualProblemSize(), vid); + initData(m_a); +} + +void DAXPY_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize()); +} + +void DAXPY_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_x); + deallocData(m_y); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp new file mode 100644 index 000000000..909939a45 --- /dev/null +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -0,0 +1,78 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// DAXPY_ATOMIC kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// y[i] += a * x[i] ; +/// } +/// + +#ifndef RAJAPerf_Basic_DAXPY_ATOMIC_HPP +#define RAJAPerf_Basic_DAXPY_ATOMIC_HPP + +#define DAXPY_ATOMIC_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; \ + Real_type a = m_a; + +#define DAXPY_ATOMIC_BODY \ + y[i] += a * x[i] ; + +#define DAXPY_ATOMIC_RAJA_BODY(policy) \ + RAJA::atomicAdd(&y[i], a * x[i]); + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class DAXPY_ATOMIC : public KernelBase +{ +public: + + DAXPY_ATOMIC(const RunParams& params); + + ~DAXPY_ATOMIC(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void 
runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + Real_ptr m_x; + Real_ptr m_y; + Real_type m_a; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 8c7f9fa11..66146371c 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define IF_QUAD_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); \ @@ -43,18 +37,22 @@ namespace basic deallocCudaDeviceData(x1); \ deallocCudaDeviceData(x2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void ifquad(Real_ptr x1, Real_ptr x2, Real_ptr a, Real_ptr b, Real_ptr c, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { IF_QUAD_BODY; } } -void IF_QUAD::runCudaVariant(VariantID vid) + +template < size_t block_size > +void IF_QUAD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -70,7 +68,7 @@ void IF_QUAD::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - ifquad<<>>( x1, x2, a, b, c, iend ); + ifquad<<>>( x1, x2, a, b, c, iend ); cudaErrchk( cudaGetLastError() ); } @@ -86,7 +84,7 @@ void IF_QUAD::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { IF_QUAD_BODY; }); @@ -115,10 +113,12 @@ void IF_QUAD::runCudaVariant(VariantID vid) IF_QUAD_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n IF_QUAD : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(IF_QUAD, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 49557e3e8..6ded209a9 100644 --- 
a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define IF_QUAD_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(b, m_b, iend); \ @@ -43,18 +37,22 @@ namespace basic deallocHipDeviceData(x1); \ deallocHipDeviceData(x2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void ifquad(Real_ptr x1, Real_ptr x2, Real_ptr a, Real_ptr b, Real_ptr c, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { IF_QUAD_BODY; } } -void IF_QUAD::runHipVariant(VariantID vid) + +template < size_t block_size > +void IF_QUAD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -70,7 +68,7 @@ void IF_QUAD::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((ifquad), dim3(grid_size), dim3(block_size), 0, 0, x1, x2, a, b, c, + hipLaunchKernelGGL((ifquad), dim3(grid_size), dim3(block_size), 0, 0, x1, x2, a, b, c, iend ); hipErrchk( hipGetLastError() ); @@ -91,7 +89,7 @@ void IF_QUAD::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, ifquad_lambda); hipErrchk( hipGetLastError() ); @@ -118,10 +116,12 @@ 
void IF_QUAD::runHipVariant(VariantID vid) IF_QUAD_DATA_TEARDOWN_HIP; } else { - std::cout << "\n IF_QUAD : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(IF_QUAD, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD-OMP.cpp b/src/basic/IF_QUAD-OMP.cpp index 659d8a12a..93ea37e88 100644 --- a/src/basic/IF_QUAD-OMP.cpp +++ b/src/basic/IF_QUAD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void IF_QUAD::runOpenMPVariant(VariantID vid) +void IF_QUAD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void IF_QUAD::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index 0a16fccc8..ca0a4ac0a 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -45,7 +45,7 @@ namespace basic deallocOpenMPDeviceData(x1, did); \ deallocOpenMPDeviceData(x2, did); -void IF_QUAD::runOpenMPTargetVariant(VariantID vid) +void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -61,7 +61,7 @@ void IF_QUAD::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(a, b, c, x1, x2) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { IF_QUAD_BODY; } @@ -89,7 +89,7 @@ void IF_QUAD::runOpenMPTargetVariant(VariantID vid) IF_QUAD_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n IF_QUAD : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp index 051e513b0..cb303701d 100644 --- a/src/basic/IF_QUAD-Seq.cpp +++ b/src/basic/IF_QUAD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void IF_QUAD::runSeqVariant(VariantID vid) +void IF_QUAD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -47,7 +47,7 @@ void IF_QUAD::runSeqVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) +#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -79,7 +79,7 @@ void IF_QUAD::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 2baff8244..69396d330 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -26,7 +26,7 @@ IF_QUAD::IF_QUAD(const RunParams& params) setActualProblemSize( getTargetProblemSize() ); - setItsPerRep( getActualProblemSize() ); + setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( (2*sizeof(Real_type) + 3*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(11 * getActualProblemSize()); // 1 sqrt @@ -61,7 +61,7 @@ IF_QUAD::~IF_QUAD() { } -void IF_QUAD::setUp(VariantID vid) +void IF_QUAD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandSign(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -70,13 +70,13 @@ void IF_QUAD::setUp(VariantID vid) allocAndInitDataConst(m_x2, getActualProblemSize(), 0.0, vid); } -void IF_QUAD::updateChecksum(VariantID vid) +void IF_QUAD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor ); - checksum[vid] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor ); } -void IF_QUAD::tearDown(VariantID vid) +void IF_QUAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index dad204ce3..4d2a22c22 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -60,17 +60,27 @@ class IF_QUAD : public KernelBase ~IF_QUAD(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_a; Real_ptr m_b; Real_ptr m_c; diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp new file mode 100644 index 000000000..22e5fdaaf --- /dev/null +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -0,0 +1,318 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include +#include +#include +#include + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define magic numbers for CUDA execution + // + const size_t warp_size = 32; + const size_t items_per_thread = 15; + + +#define INDEXLIST_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(list, m_list, iend); + +#define INDEXLIST_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_list, list, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(list); + + +// perform a grid scan on val and returns the result at each thread +// in exclusive and inclusive, note that val is used as scratch space +template < size_t block_size, size_t items_per_thread > +__device__ void grid_scan(const int block_id, + Index_type (&val)[items_per_thread], + Index_type (&exclusive)[items_per_thread], + Index_type (&inclusive)[items_per_thread], + Index_type* block_counts, + Index_type* grid_counts, + unsigned* block_readys) +{ + const bool first_block = (block_id == 0); + const bool last_block = (block_id == gridDim.x-1); + const bool last_thread = (threadIdx.x == block_size-1); + const bool last_warp = (threadIdx.x >= block_size - warp_size); + const int warp_index = (threadIdx.x % warp_size); + const unsigned warp_index_mask = (1u << warp_index); + const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u); + + using BlockScan = cub::BlockScan; //, cub::BLOCK_SCAN_WARP_SCANS>; + using BlockExchange = cub::BlockExchange; + using WarpReduce = cub::WarpReduce; + + union SharedStorage { + typename BlockScan::TempStorage block_scan_storage; + typename BlockExchange::TempStorage block_exchange_storage; + typename WarpReduce::TempStorage warp_reduce_storage; + volatile 
Index_type prev_grid_count; + }; + __shared__ SharedStorage s_temp_storage; + + + BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val); + __syncthreads(); + + + BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive); + __syncthreads(); + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + inclusive[ti] = exclusive[ti] + val[ti]; + } + + BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive); + __syncthreads(); + BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive); + __syncthreads(); + if (first_block) { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } + + } else { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 1u); // write block_counts is ready + } + + // get prev_grid_count using last warp in block + if (last_warp) { + + Index_type prev_grid_count = 0; + + // accumulate previous block counts into registers of warp + + int prev_block_base_id = block_id - warp_size; + + unsigned prev_block_ready = 0u; + unsigned prev_blocks_ready_ballot = 0u; + unsigned prev_grids_ready_ballot = 0u; + + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is ready + while (prev_block_base_id >= 0) { + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + do { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + + 
prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); + + } while (prev_blocks_ready_ballot != 0xffffffffu); + + prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); + + if (prev_grids_ready_ballot != 0u) { + break; + } + + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + + prev_block_ready = 0u; + + prev_block_base_id -= warp_size; + } + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + // this checks that block counts is ready for all blocks above + // the highest grid count that is ready + while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { + + if (prev_block_id >= 0) { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + } + + prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); + prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); + } + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // read one grid_count from a block with id grid_count_ready_id + // and read the block_counts from blocks with higher ids. 
+ if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } + + + prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); + prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + + if (last_thread) { + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_temp_storage.prev_grid_count = prev_grid_count; + } + } + + __syncthreads(); + Index_type prev_grid_count = s_temp_storage.prev_grid_count; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } + + if (last_block) { + for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { + while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); + } + } + } +} + +template < size_t block_size, size_t items_per_thread > +__launch_bounds__(block_size) +__global__ void indexlist(Real_ptr x, + Int_ptr list, + Index_type* block_counts, + Index_type* grid_counts, + unsigned* block_readys, + Index_type* len, + Index_type iend) +{ + // blocks do start running in order in cuda and hip, so a block with a higher + // index can wait on a block with a lower index without deadlocking + // (replace with an atomicInc if this changes) + const int block_id = blockIdx.x; + + Index_type vals[items_per_thread]; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * 
block_size + threadIdx.x; + Index_type val = 0; + if (i < iend) { + if (INDEXLIST_CONDITIONAL) { + val = 1; + } + } + vals[ti] = val; + } + + Index_type exclusives[items_per_thread]; + Index_type inclusives[items_per_thread]; + grid_scan( + block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + Index_type exclusive = exclusives[ti]; + Index_type inclusive = inclusives[ti]; + if (i < iend) { + if (exclusive != inclusive) { + list[exclusive] = i; + } + if (i == iend-1) { + *len = inclusive; + } + } + } +} + +template < size_t block_size > +void INDEXLIST::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + INDEXLIST_DATA_SETUP_CUDA; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); + const size_t shmem_size = 0; + + Index_type* len; + allocCudaPinnedData(len, 1); + Index_type* block_counts; + allocCudaDeviceData(block_counts, grid_size); + Index_type* grid_counts; + allocCudaDeviceData(grid_counts, grid_size); + unsigned* block_readys; + allocCudaDeviceData(block_readys, grid_size); + cudaErrchk( cudaMemset(block_readys, 0, sizeof(unsigned)*grid_size) ); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + indexlist + <<>>( + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, + len, iend-ibegin ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaDeviceSynchronize() ); + m_len = *len; + + } + stopTimer(); + + deallocCudaPinnedData(len); + deallocCudaDeviceData(block_counts); + deallocCudaDeviceData(grid_counts); + deallocCudaDeviceData(block_readys); + + INDEXLIST_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INDEXLIST : Unknown 
variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INDEXLIST, Cuda) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp new file mode 100644 index 000000000..1450244e8 --- /dev/null +++ b/src/basic/INDEXLIST-Hip.cpp @@ -0,0 +1,318 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include +#include +#include +#include + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define magic numbers for HIP execution + // + const size_t warp_size = 64; + const size_t items_per_thread = 8; + + +#define INDEXLIST_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(list, m_list, iend); + +#define INDEXLIST_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_list, list, iend); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(list); + + +// perform a grid scan on val and returns the result at each thread +// in exclusive and inclusive, note that val is used as scratch space +template < size_t block_size, size_t items_per_thread > +__device__ void grid_scan(const int block_id, + Index_type (&val)[items_per_thread], + Index_type (&exclusive)[items_per_thread], + Index_type (&inclusive)[items_per_thread], + Index_type* block_counts, + Index_type* grid_counts, + unsigned* block_readys) +{ + const bool first_block = (block_id == 0); + const bool last_block = (block_id == static_cast(gridDim.x-1)); + const bool last_thread = 
(threadIdx.x == block_size-1); + const bool last_warp = (threadIdx.x >= block_size - warp_size); + const int warp_index = (threadIdx.x % warp_size); + const unsigned long long warp_index_mask = (1ull << warp_index); + const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull); + + using BlockScan = rocprim::block_scan; //, rocprim::block_scan_algorithm::reduce_then_scan>; + using BlockExchange = rocprim::block_exchange; + using WarpReduce = rocprim::warp_reduce; + + union SharedStorage { + typename BlockScan::storage_type block_scan_storage; + typename BlockExchange::storage_type block_exchange_storage; + typename WarpReduce::storage_type warp_reduce_storage; + volatile Index_type prev_grid_count; + }; + __shared__ SharedStorage s_temp_storage; + + + BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage); + __syncthreads(); + + + BlockScan().exclusive_scan(val, exclusive, Index_type{0}, s_temp_storage.block_scan_storage); + __syncthreads(); + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + inclusive[ti] = exclusive[ti] + val[ti]; + } + + BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage); + __syncthreads(); + BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage); + __syncthreads(); + if (first_block) { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } + + } else { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + 
atomicExch(&block_readys[block_id], 1u); // write block_counts is ready + } + + // get prev_grid_count using last warp in block + if (last_warp) { + + Index_type prev_grid_count = 0; + + // accumulate previous block counts into registers of warp + + int prev_block_base_id = block_id - warp_size; + + unsigned prev_block_ready = 0u; + unsigned long long prev_blocks_ready_ballot = 0ull; + unsigned long long prev_grids_ready_ballot = 0ull; + + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is ready + while (prev_block_base_id >= 0) { + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + do { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); + + } while (prev_blocks_ready_ballot != 0xffffffffffffffffull); + + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); + + if (prev_grids_ready_ballot != 0ull) { + break; + } + + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + + prev_block_ready = 0u; + + prev_block_base_id -= warp_size; + } + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + // this checks that block counts is ready for all blocks above + // the highest grid count that is ready + while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { + + if (prev_block_id >= 0) { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + } + + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); + } + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // read one grid_count from a block with id grid_count_ready_id + // and read the block_counts from blocks with higher ids. 
+ if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } + + + WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage); + prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + + if (last_thread) { + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_temp_storage.prev_grid_count = prev_grid_count; + } + } + + __syncthreads(); + Index_type prev_grid_count = s_temp_storage.prev_grid_count; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } + + if (last_block) { + for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { + while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); + } + } + } +} + +template < size_t block_size, size_t items_per_thread > +__launch_bounds__(block_size) +__global__ void indexlist(Real_ptr x, + Int_ptr list, + Index_type* block_counts, + Index_type* grid_counts, + unsigned* block_readys, + Index_type* len, + Index_type iend) +{ + // blocks do start running in order in cuda and hip, so a block with a higher + // index can wait on a block with a lower index without deadlocking + // (replace with an atomicInc if this changes) + const int block_id = blockIdx.x; + + Index_type vals[items_per_thread]; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + 
threadIdx.x; + Index_type val = 0; + if (i < iend) { + if (INDEXLIST_CONDITIONAL) { + val = 1; + } + } + vals[ti] = val; + } + + Index_type exclusives[items_per_thread]; + Index_type inclusives[items_per_thread]; + grid_scan( + block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + Index_type exclusive = exclusives[ti]; + Index_type inclusive = inclusives[ti]; + if (i < iend) { + if (exclusive != inclusive) { + list[exclusive] = i; + } + if (i == iend-1) { + *len = inclusive; + } + } + } +} + +template < size_t block_size > +void INDEXLIST::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + + if ( vid == Base_HIP ) { + + INDEXLIST_DATA_SETUP_HIP; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); + const size_t shmem_size = 0; + + Index_type* len; + allocHipPinnedData(len, 1); + Index_type* block_counts; + allocHipDeviceData(block_counts, grid_size); + Index_type* grid_counts; + allocHipDeviceData(grid_counts, grid_size); + unsigned* block_readys; + allocHipDeviceData(block_readys, grid_size); + hipErrchk( hipMemset(block_readys, 0, sizeof(unsigned)*grid_size) ); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + indexlist + <<>>( + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, + len, iend-ibegin ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipDeviceSynchronize() ); + m_len = *len; + + } + stopTimer(); + + deallocHipPinnedData(len); + deallocHipDeviceData(block_counts); + deallocHipDeviceData(grid_counts); + deallocHipDeviceData(block_readys); + + INDEXLIST_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n INDEXLIST : Unknown variant id = " << vid << 
std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INDEXLIST, Hip) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp new file mode 100644 index 000000000..681e62699 --- /dev/null +++ b/src/basic/INDEXLIST-OMP.cpp @@ -0,0 +1,207 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +void INDEXLIST::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) +#else + const Index_type n = iend - ibegin; + ::std::vector tmp_scan(n); + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_sums(p0); +#endif + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + #pragma omp parallel for reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend; ++i ) { + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + list[count] = i ; + inc = 1; + } + #pragma omp scan exclusive(count) + count += inc; + } +#else + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + const int pid = 
omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? iend : (pid+1) * step + ibegin; + + Index_type local_sum_var = 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + inc = 1; + } + tmp_scan[i] = inc; + local_sum_var += inc; + } + thread_sums[pid] = local_sum_var; + + #pragma omp barrier + + Index_type local_count_var = 0; + for (int ip = 0; ip < pid; ++ip) { + local_count_var += thread_sums[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + Index_type inc = tmp_scan[i]; + if (inc) { + list[local_count_var] = i ; + } + local_count_var += inc; + } + + if (pid == p-1) { + count = local_count_var; + } + } +#endif + + m_len = count; + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + auto indexlist_lam = [=](Index_type i, Index_type count) { + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + list[count] = i ; + inc = 1; + } + return inc; + }; +#else + auto indexlist_lam_input = [=](Index_type i) { + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + inc = 1; + } + return inc; + }; + auto indexlist_lam_output = [=](Index_type i, Index_type count) { + list[count] = i ; + }; + const Index_type n = iend - ibegin; + ::std::vector tmp_scan(n); + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_sums(p0); +#endif + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + #pragma omp parallel for reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp scan exclusive(count) + count += indexlist_lam(i, count); + } +#else + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + 
const int pid = omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? iend : (pid+1) * step + ibegin; + + Index_type local_sum_var = 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + + Index_type inc = indexlist_lam_input(i); + tmp_scan[i] = inc; + local_sum_var += inc; + } + thread_sums[pid] = local_sum_var; + + #pragma omp barrier + + Index_type local_count_var = 0; + for (int ip = 0; ip < pid; ++ip) { + local_count_var += thread_sums[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + Index_type inc = tmp_scan[i]; + if (inc) { + indexlist_lam_output(i, local_count_var); + } + local_count_var += inc; + } + + if (pid == p-1) { + count = local_count_var; + } + } +#endif + + m_len = count; + + } + stopTimer(); + + break; + } + + default : { + ignore_unused(run_reps, ibegin, iend, x, list); + std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST-OMPTarget.cpp b/src/basic/INDEXLIST-OMPTarget.cpp new file mode 100644 index 000000000..99f875b27 --- /dev/null +++ b/src/basic/INDEXLIST-OMPTarget.cpp @@ -0,0 +1,100 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define INDEXLIST_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + \ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ + allocAndInitOpenMPDeviceData(list, m_list, iend, did, hid); + +#define INDEXLIST_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_list, list, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(list, did); + +#endif + + +void INDEXLIST::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMPTarget : { + + INDEXLIST_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + #pragma omp target is_device_ptr(x, list) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) \ + reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend; ++i ) { + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + list[count] = i ; + inc = 1; + } + #pragma omp scan exclusive(count) + count += inc; + } + + m_len = count; + + } + stopTimer(); + + INDEXLIST_DATA_TEARDOWN_OMP_TARGET; + + break; + } + + default : { + ignore_unused(run_reps, 
ibegin, iend, x, list); + std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST-Seq.cpp b/src/basic/INDEXLIST-Seq.cpp new file mode 100644 index 000000000..e7bb7139b --- /dev/null +++ b/src/basic/INDEXLIST-Seq.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void INDEXLIST::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + + for (Index_type i = ibegin; i < iend; ++i ) { + INDEXLIST_BODY; + } + + m_len = count; + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto indexlist_base_lam = [=](Index_type i, Index_type& count) { + INDEXLIST_BODY + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + + for (Index_type i = ibegin; i < iend; ++i ) { + indexlist_base_lam(i, count); + } + + m_len = count; + + } + stopTimer(); + + break; + } +#endif + + default : { + std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST.cpp 
b/src/basic/INDEXLIST.cpp new file mode 100644 index 000000000..df523fbf6 --- /dev/null +++ b/src/basic/INDEXLIST.cpp @@ -0,0 +1,79 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + + +INDEXLIST::INDEXLIST(const RunParams& params) + : KernelBase(rajaperf::Basic_INDEXLIST, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(100); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Index_type) + 1*sizeof(Index_type)) + + (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 + // about 50% output + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setFLOPsPerRep(0); + + setUsesFeature(Forall); + setUsesFeature(Scan); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + setVariantDefined( Base_OpenMPTarget ); +#endif + + setVariantDefined( Base_CUDA ); + + setVariantDefined( Base_HIP ); +} + +INDEXLIST::~INDEXLIST() +{ +} + +void INDEXLIST::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); + allocAndInitData(m_list, getActualProblemSize(), vid); + m_len = -1; +} + +void INDEXLIST::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_list, getActualProblemSize()); + 
checksum[vid][tune_idx] += Checksum_type(m_len); +} + +void INDEXLIST::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_x); + deallocData(m_list); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp new file mode 100644 index 000000000..0836d8197 --- /dev/null +++ b/src/basic/INDEXLIST.hpp @@ -0,0 +1,83 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// INDEXLIST kernel reference implementation: +/// +/// Index_type count = 0; +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// if (x[i] < 0.0) { +/// list[count++] = i ; +/// } +/// } +/// Index_type len = count; +/// + +#ifndef RAJAPerf_Basic_INDEXLIST_HPP +#define RAJAPerf_Basic_INDEXLIST_HPP + +#define INDEXLIST_DATA_SETUP \ + Real_ptr x = m_x; \ + Int_ptr list = m_list; + +#define INDEXLIST_CONDITIONAL \ + x[i] < 0.0 + +#define INDEXLIST_BODY \ + if (INDEXLIST_CONDITIONAL) { \ + list[count++] = i ; \ + } + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class INDEXLIST : public KernelBase +{ +public: + + INDEXLIST(const RunParams& params); + + ~INDEXLIST(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void 
setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + + Real_ptr m_x; + Int_ptr m_list; + Index_type m_len; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp new file mode 100644 index 000000000..22e263b4f --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -0,0 +1,178 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define INDEXLIST_3LOOP_DATA_SETUP_CUDA \ + Index_type* counts; \ + allocCudaDeviceData(counts, iend+1); \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(list, m_list, iend); + +#define INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(counts); \ + getCudaDeviceData(m_list, list, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(list); + + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void indexlist_conditional(Real_ptr x, + Index_type* counts, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void indexlist_make_list(Int_ptr list, + Index_type* counts, + Index_type* len, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + INDEXLIST_3LOOP_MAKE_LIST; + if (i == iend-1) { + *len = counts[i+1]; + } + } +} + + +template < size_t block_size > +void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + INDEXLIST_3LOOP_DATA_SETUP_CUDA; + + Index_type* len; + allocCudaPinnedData(len, 1); + + cudaStream_t stream = RAJA::resources::Cuda::get_default().get_stream(); + + RAJA::operators::plus binary_op; + Index_type init_val = 0; + int scan_size = iend+1 - ibegin; + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); + + unsigned char* temp_storage; + allocCudaDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + indexlist_conditional<<>>( + x, counts, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); + + indexlist_make_list<<>>( + list, counts, len, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaStreamSynchronize(stream) ); + m_len = *len; + + } + stopTimer(); + + deallocCudaDeviceData(temp_storage); + deallocCudaPinnedData(len); + + INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + 
INDEXLIST_3LOOP_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + }); + + RAJA::exclusive_scan_inplace< RAJA::cuda_exec >( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INDEXLIST_3LOOP, Cuda) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp new file mode 100644 index 000000000..205b662dd --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -0,0 +1,200 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define INDEXLIST_3LOOP_DATA_SETUP_HIP \ + Index_type* counts; \ + allocHipDeviceData(counts, iend+1); \ + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(list, m_list, iend); + +#define INDEXLIST_3LOOP_DATA_TEARDOWN_HIP \ + deallocHipDeviceData(counts); \ + getHipDeviceData(m_list, list, iend); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(list); + + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void indexlist_conditional(Real_ptr x, + Index_type* counts, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void indexlist_make_list(Int_ptr list, + Index_type* counts, + Index_type* len, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + INDEXLIST_3LOOP_MAKE_LIST; + if (i == iend-1) { + *len = counts[i+1]; + } + } +} + + +template < size_t block_size > +void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + + if ( vid == Base_HIP ) { + + INDEXLIST_3LOOP_DATA_SETUP_HIP; + + Index_type* len; + allocHipPinnedData(len, 1); + + hipStream_t stream = RAJA::resources::Hip::get_default().get_stream(); + + RAJA::operators::plus binary_op; + Index_type init_val = 0; + int scan_size = iend+1 - ibegin; + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; +#if defined(__HIPCC__) + 
hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + init_val, + scan_size, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); +#endif + + unsigned char* temp_storage; + allocHipDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, 0, stream, + x, counts, iend ); + hipErrchk( hipGetLastError() ); + +#if defined(__HIPCC__) + hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + init_val, + scan_size, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); +#endif + + hipLaunchKernelGGL((indexlist_make_list), grid_size, block_size, 0, stream, + list, counts, len, iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipStreamSynchronize(stream) ); + m_len = *len; + + } + stopTimer(); + + deallocHipDeviceData(temp_storage); + deallocHipPinnedData(len); + + INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + INDEXLIST_3LOOP_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0;
+        });
+
+      RAJA::exclusive_scan_inplace< RAJA::hip_exec<block_size, true /*async*/> >(
+          RAJA::make_span(counts+ibegin, iend+1-ibegin));
+
+      RAJA::forall< RAJA::hip_exec<block_size, true /*async*/> >(
+        RAJA::RangeSegment(ibegin, iend),
+        [=] __device__ (Index_type i) {
+        if (counts[i] != counts[i+1]) {
+          list[counts[i]] = i;
+          len += 1;
+        }
+      });
+
+      m_len = len.get();
+
+    }
+    stopTimer();
+
+    INDEXLIST_3LOOP_DATA_TEARDOWN_HIP;
+
+  } else {
+    std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INDEXLIST_3LOOP, Hip)
+
+} // end namespace basic
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_HIP
diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp
new file mode 100644
index 000000000..3ba12ea0a
--- /dev/null
+++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp
@@ -0,0 +1,248 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/COPYRIGHT file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "INDEXLIST_3LOOP.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace basic
+{
+
+#define INDEXLIST_3LOOP_DATA_SETUP_OMP \
+  Index_type* counts = new Index_type[iend+1];
+
+#define INDEXLIST_3LOOP_DATA_TEARDOWN_OMP \
+  delete[] counts; counts = nullptr;
+
+
+void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+{
+#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  INDEXLIST_3LOOP_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_OpenMP : {
+
+      INDEXLIST_3LOOP_DATA_SETUP_OMP;
+
+#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN)
+#else
+      const Index_type n = iend+1 - ibegin;
+      const int p0 = static_cast<int>(std::min(n, static_cast<Index_type>(omp_get_max_threads())));
+      ::std::vector<Index_type> thread_counts(p0);
+#endif
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0;
+        }
+
+#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN)
+        Index_type count = 0;
+        #pragma omp parallel for reduction(inscan, +:count)
+        for (Index_type i = ibegin; i < iend+1; ++i ) {
+          Index_type inc = counts[i];
+          counts[i] = count;
+          #pragma omp scan exclusive(count)
+          count += inc;
+        }
+#else
+        #pragma omp parallel num_threads(p0)
+        {
+          const int p = omp_get_num_threads();
+          const int pid = omp_get_thread_num();
+          const Index_type step = n / p;
+          const Index_type local_begin = pid * step + ibegin;
+          const Index_type local_end = (pid == p-1) ?
iend+1 : (pid+1) * step + ibegin;
+
+          Index_type local_count = 0;
+          for (Index_type i = local_begin; i < local_end; ++i ) {
+            Index_type inc = counts[i];
+            counts[i] = local_count;
+            local_count += inc;
+          }
+          thread_counts[pid] = local_count;
+
+          #pragma omp barrier
+
+          if (pid != 0) {
+
+            Index_type prev_count = 0;
+            for (int ip = 0; ip < pid; ++ip) {
+              prev_count += thread_counts[ip];
+            }
+
+            for (Index_type i = local_begin; i < local_end; ++i ) {
+              counts[i] += prev_count;
+            }
+          }
+        }
+#endif
+
+        #pragma omp parallel for
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          INDEXLIST_3LOOP_MAKE_LIST;
+        }
+
+        m_len = counts[iend];
+
+      }
+      stopTimer();
+
+      INDEXLIST_3LOOP_DATA_TEARDOWN_OMP;
+
+      break;
+    }
+
+    case Lambda_OpenMP : {
+
+      INDEXLIST_3LOOP_DATA_SETUP_OMP;
+
+      auto indexlist_conditional_lam = [=](Index_type i) {
+        counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0;
+      };
+
+      auto indexlist_make_list_lam = [=](Index_type i) {
+        INDEXLIST_3LOOP_MAKE_LIST;
+      };
+
+#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN)
+#else
+      const Index_type n = iend+1 - ibegin;
+      const int p0 = static_cast<int>(std::min(n, static_cast<Index_type>(omp_get_max_threads())));
+      ::std::vector<Index_type> thread_counts(p0);
+#endif
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          indexlist_conditional_lam(i);
+        }
+
+#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN)
+        Index_type count = 0;
+        #pragma omp parallel for reduction(inscan, +:count)
+        for (Index_type i = ibegin; i < iend+1; ++i ) {
+          Index_type inc = counts[i];
+          counts[i] = count;
+          #pragma omp scan exclusive(count)
+          count += inc;
+        }
+#else
+        #pragma omp parallel num_threads(p0)
+        {
+          const int p = omp_get_num_threads();
+          const int pid = omp_get_thread_num();
+          const Index_type step = n / p;
+          const Index_type local_begin = pid * step + ibegin;
+          const Index_type local_end = (pid == p-1) ?
iend+1 : (pid+1) * step + ibegin; + + Index_type local_count = 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + Index_type inc = counts[i]; + counts[i] = local_count; + local_count += inc; + } + thread_counts[pid] = local_count; + + #pragma omp barrier + + if (pid != 0) { + + Index_type prev_count = 0; + for (int ip = 0; ip < pid; ++ip) { + prev_count += thread_counts[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + counts[i] += prev_count; + } + } + } +#endif + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + indexlist_make_list_lam(i); + } + + m_len = counts[iend]; + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_OMP; + + break; + } + + case RAJA_OpenMP : { + + INDEXLIST_3LOOP_DATA_SETUP_OMP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + }); + + RAJA::exclusive_scan_inplace( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_OMP; + + break; + } + + default : { + std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp new file mode 100644 index 000000000..d58dbe9e6 --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp @@ -0,0 +1,113 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. 
+// See the RAJAPerf/COPYRIGHT file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "INDEXLIST_3LOOP.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace basic
+{
+
+#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \
+ && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN)
+
+  //
+  // Define threads per team for target execution
+  //
+  const size_t threads_per_team = 256;
+
+#define INDEXLIST_3LOOP_DATA_SETUP_OMP_TARGET \
+  int hid = omp_get_initial_device(); \
+  int did = omp_get_default_device(); \
+  \
+  Index_type* counts = nullptr; \
+  allocOpenMPDeviceData(counts, iend+1, did); \
+  allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \
+  allocAndInitOpenMPDeviceData(list, m_list, iend, did, hid);
+
+#define INDEXLIST_3LOOP_DATA_TEARDOWN_OMP_TARGET \
+  deallocOpenMPDeviceData(counts, did); \
+  getOpenMPDeviceData(m_list, list, iend, hid, did); \
+  deallocOpenMPDeviceData(x, did); \
+  deallocOpenMPDeviceData(list, did);
+
+#endif
+
+
+void INDEXLIST_3LOOP::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+{
+#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \
+ && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN)
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  INDEXLIST_3LOOP_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_OpenMPTarget : {
+
+      INDEXLIST_3LOOP_DATA_SETUP_OMP_TARGET;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        // NOTE(review): removed a stray "#pragma omp parallel for" that sat here
+        // with no associated loop (it preceded the target region; would not compile).
+        #pragma omp target is_device_ptr(counts, x) device( did )
+        #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1)
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ?
1 : 0; + } + + Index_type count = 0; + #pragma omp target is_device_ptr(counts) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) \ + reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend+1; ++i ) { + Index_type inc = counts[i]; + counts[i] = count; + #pragma omp scan exclusive(count) + count += inc; + } + + #pragma omp target is_device_ptr(counts, list) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + INDEXLIST_3LOOP_MAKE_LIST; + } + + m_len = counts[iend]; + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_OMP_TARGET; + + break; + } + + default : { + std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp new file mode 100644 index 000000000..14f62a8a7 --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -0,0 +1,160 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define INDEXLIST_3LOOP_DATA_SETUP_Seq \ + Index_type* counts = new Index_type[iend+1]; + +#define INDEXLIST_3LOOP_DATA_TEARDOWN_Seq \ + delete[] counts; counts = nullptr; + + + +void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + INDEXLIST_3LOOP_DATA_SETUP_Seq; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + } + + Index_type count = 0; + + for (Index_type i = ibegin; i < iend+1; ++i ) { + Index_type inc = counts[i]; + counts[i] = count; + count += inc; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + INDEXLIST_3LOOP_MAKE_LIST; + } + + m_len = counts[iend]; + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_Seq; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + INDEXLIST_3LOOP_DATA_SETUP_Seq; + + auto indexlist_conditional_lam = [=](Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; + }; + + auto indexlist_make_list_lam = [=](Index_type i) { + INDEXLIST_3LOOP_MAKE_LIST; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + indexlist_conditional_lam(i); + } + + Index_type count = 0; + + for (Index_type i = ibegin; i < iend+1; ++i ) { + Index_type inc = counts[i]; + counts[i] = count; + count += inc; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + indexlist_make_list_lam(i); + } + + m_len = counts[iend]; + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_Seq; + + break; + } + + case RAJA_Seq : { + + INDEXLIST_3LOOP_DATA_SETUP_Seq; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + }); + + RAJA::exclusive_scan_inplace( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_Seq; + + break; + } +#endif + + default : { + std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp new file mode 100644 index 000000000..e7d4215fa --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -0,0 +1,88 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + + +INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) + : KernelBase(rajaperf::Basic_INDEXLIST_3LOOP, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(100); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( 3 * getActualProblemSize() + 1 ); + setKernelsPerRep(3); + setBytesPerRep( (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() + + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + + + (1*sizeof(Index_type) + 1*sizeof(Index_type)) + + (1*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + + + (0*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + + (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 ); // about 50% output + setFLOPsPerRep(0); + + setUsesFeature(Forall); + setUsesFeature(Scan); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + setVariantDefined( Base_OpenMPTarget ); +#endif + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +INDEXLIST_3LOOP::~INDEXLIST_3LOOP() +{ +} + +void INDEXLIST_3LOOP::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); + allocAndInitData(m_list, getActualProblemSize(), vid); + m_len = -1; +} + +void INDEXLIST_3LOOP::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_list, 
getActualProblemSize()); + checksum[vid][tune_idx] += Checksum_type(m_len); +} + +void INDEXLIST_3LOOP::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_x); + deallocData(m_list); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp new file mode 100644 index 000000000..e19ee5508 --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -0,0 +1,94 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// INDEXLIST_3LOOP kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// counts[i] = (x[i] < 0.0) ? 
1 : 0; +/// } +/// +/// Index_type count = 0; +/// for (Index_type i = ibegin; i < iend+1; ++i ) { +/// Index_type inc = counts[i]; +/// counts[i] = count; +/// count += inc; +/// } +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// if (counts[i] != counts[i+1]) { +/// list[counts[i]] = i; +/// } +/// } +/// +/// Index_type len = counts[iend]; +/// + +#ifndef RAJAPerf_Basic_INDEXLIST_3LOOP_HPP +#define RAJAPerf_Basic_INDEXLIST_3LOOP_HPP + +#define INDEXLIST_3LOOP_DATA_SETUP \ + Real_ptr x = m_x; \ + Int_ptr list = m_list; + +#define INDEXLIST_3LOOP_CONDITIONAL \ + x[i] < 0.0 + +#define INDEXLIST_3LOOP_MAKE_LIST \ + if (counts[i] != counts[i+1]) { \ + list[counts[i]] = i ; \ + } + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class INDEXLIST_3LOOP : public KernelBase +{ +public: + + INDEXLIST_3LOOP(const RunParams& params); + + ~INDEXLIST_3LOOP(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + + Real_ptr m_x; + Int_ptr m_list; + Index_type m_len; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index cee3e46af..212a1e3a2 
100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define INIT3_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(out1, m_out1, iend); \ allocAndInitCudaDeviceData(out2, m_out2, iend); \ @@ -44,18 +38,22 @@ namespace basic deallocCudaDeviceData(in1); \ deallocCudaDeviceData(in2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, Real_ptr in1, Real_ptr in2, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT3_BODY; } } -void INIT3::runCudaVariant(VariantID vid) + +template < size_t block_size > +void INIT3::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -71,7 +69,7 @@ void INIT3::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - init3<<>>( out1, out2, out3, in1, in2, + init3<<>>( out1, out2, out3, in1, in2, iend ); cudaErrchk( cudaGetLastError() ); @@ -88,7 +86,7 @@ void INIT3::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { INIT3_BODY; }); @@ -117,10 +115,12 @@ void INIT3::runCudaVariant(VariantID vid) INIT3_DATA_TEARDOWN_CUDA; } 
else { - std::cout << "\n INIT3 : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT3, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index 51e0f2b54..af3276a7d 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define INIT3_DATA_SETUP_HIP \ allocAndInitHipDeviceData(out1, m_out1, iend); \ allocAndInitHipDeviceData(out2, m_out2, iend); \ @@ -44,18 +38,22 @@ namespace basic deallocHipDeviceData(in1); \ deallocHipDeviceData(in2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, Real_ptr in1, Real_ptr in2, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT3_BODY; } } -void INIT3::runHipVariant(VariantID vid) + +template < size_t block_size > +void INIT3::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -71,7 +69,7 @@ void INIT3::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((init3), dim3(grid_size), dim3(block_size), 0, 0, out1, out2, out3, in1, in2, + hipLaunchKernelGGL((init3), dim3(grid_size), 
dim3(block_size), 0, 0, out1, out2, out3, in1, in2, iend ); hipErrchk( hipGetLastError() ); @@ -92,7 +90,7 @@ void INIT3::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, init3_lambda); hipErrchk( hipGetLastError() ); @@ -119,10 +117,12 @@ void INIT3::runHipVariant(VariantID vid) INIT3_DATA_TEARDOWN_HIP; } else { - std::cout << "\n INIT3 : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT3, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3-OMP.cpp b/src/basic/INIT3-OMP.cpp index 7d05f9af4..8df233cc5 100644 --- a/src/basic/INIT3-OMP.cpp +++ b/src/basic/INIT3-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void INIT3::runOpenMPVariant(VariantID vid) +void INIT3::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void INIT3::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index 7d3f9ce05..d2b5eb127 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -47,7 +47,7 @@ namespace basic deallocOpenMPDeviceData(in2, did); -void INIT3::runOpenMPTargetVariant(VariantID vid) +void INIT3::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -89,9 +89,9 @@ void INIT3::runOpenMPTargetVariant(VariantID vid) stopTimer(); INIT3_DATA_TEARDOWN_OMP_TARGET; - + } else { - std::cout << "\n INIT3 : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index b4c481632..1a1cb228a 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,18 +12,18 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void INIT3::runSeqVariant(VariantID vid) +void INIT3::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); - + INIT3_DATA_SETUP; auto init3_lam = [=](Index_type i) { @@ -79,7 +79,7 @@ void INIT3::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index cb3c14132..fc3fd024d 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -57,7 +57,7 @@ INIT3::~INIT3() { } -void INIT3::setUp(VariantID vid) +void INIT3::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_out1, getActualProblemSize(), 0.0, vid); allocAndInitDataConst(m_out2, getActualProblemSize(), 0.0, vid); @@ -66,14 +66,14 @@ void INIT3::setUp(VariantID vid) allocAndInitData(m_in2, getActualProblemSize(), vid); } -void INIT3::updateChecksum(VariantID vid) +void INIT3::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_out1, getActualProblemSize()); - checksum[vid] += calcChecksum(m_out2, getActualProblemSize()); - checksum[vid] += calcChecksum(m_out3, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out1, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out2, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize()); } -void INIT3::tearDown(VariantID vid) +void INIT3::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_out1); diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 9d9de78da..44f3622de 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -46,17 +46,27 @@ class INIT3 : public KernelBase ~INIT3(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_out1; Real_ptr m_out2; Real_ptr m_out3; diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index af70a9980..be7a0bf97 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define INIT_VIEW1D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, getActualProblemSize()); @@ -34,18 +28,22 @@ namespace basic getCudaDeviceData(m_a, a, getActualProblemSize()); \ deallocCudaDeviceData(a); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void initview1d(Real_ptr a, Real_type v, const Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT_VIEW1D_BODY; } } -void INIT_VIEW1D::runCudaVariant(VariantID vid) + +template < size_t block_size > +void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -61,7 +59,7 @@ void INIT_VIEW1D::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - initview1d<<>>( a, v, iend ); + initview1d<<>>( a, v, iend ); cudaErrchk( cudaGetLastError() ); } @@ -77,7 +75,7 @@ void INIT_VIEW1D::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { INIT_VIEW1D_BODY; }); @@ -108,10 +106,12 @@ void INIT_VIEW1D::runCudaVariant(VariantID vid) INIT_VIEW1D_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n INIT_VIEW1D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 1136b0d93..6f9d41924 100644 --- 
a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define INIT_VIEW1D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); @@ -34,18 +28,22 @@ namespace basic getHipDeviceData(m_a, a, iend); \ deallocHipDeviceData(a); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void initview1d(Real_ptr a, Real_type v, const Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT_VIEW1D_BODY; } } -void INIT_VIEW1D::runHipVariant(VariantID vid) + +template < size_t block_size > +void INIT_VIEW1D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -61,7 +59,7 @@ void INIT_VIEW1D::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((initview1d), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((initview1d), dim3(grid_size), dim3(block_size), 0, 0, a, v, iend ); hipErrchk( hipGetLastError() ); @@ -82,7 +80,7 @@ void INIT_VIEW1D::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, initview1d_lambda); hipErrchk( hipGetLastError() ); @@ -111,10 +109,12 @@ void INIT_VIEW1D::runHipVariant(VariantID vid) 
INIT_VIEW1D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n INIT_VIEW1D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D-OMP.cpp b/src/basic/INIT_VIEW1D-OMP.cpp index b36e7d44b..a0544574d 100644 --- a/src/basic/INIT_VIEW1D-OMP.cpp +++ b/src/basic/INIT_VIEW1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D::runOpenMPVariant(VariantID vid) +void INIT_VIEW1D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -87,12 +87,12 @@ void INIT_VIEW1D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp index 705d2fb6e..fba84b747 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -37,7 +37,7 @@ namespace basic deallocOpenMPDeviceData(a, did); -void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid) +void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -47,13 +47,13 @@ void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid) if ( vid == Base_OpenMPTarget ) { - INIT_VIEW1D_DATA_SETUP_OMP_TARGET; + INIT_VIEW1D_DATA_SETUP_OMP_TARGET; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(a) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { INIT_VIEW1D_BODY; } @@ -83,7 +83,7 @@ void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid) INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n INIT_VIEW1D : Unknown OMP Targetvariant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D : Unknown OMP Targetvariant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp index 2cfa4514c..f6df5969b 100644 --- a/src/basic/INIT_VIEW1D-Seq.cpp +++ b/src/basic/INIT_VIEW1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D::runSeqVariant(VariantID vid) +void INIT_VIEW1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -85,7 +85,7 @@ void INIT_VIEW1D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index bad47eae8..bd752aa06 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -58,18 +58,18 @@ INIT_VIEW1D::~INIT_VIEW1D() { } -void INIT_VIEW1D::setUp(VariantID vid) +void INIT_VIEW1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); m_val = 0.00000123; } -void INIT_VIEW1D::updateChecksum(VariantID vid) +void INIT_VIEW1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_a, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize()); } -void INIT_VIEW1D::tearDown(VariantID vid) +void INIT_VIEW1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index b215439dc..b51d38b79 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC 
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -57,17 +57,27 @@ class INIT_VIEW1D : public KernelBase ~INIT_VIEW1D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_a; Real_type m_val; }; diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 95c9e175e..2f7f6d34a 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, getActualProblemSize()); @@ -34,19 +28,23 @@ namespace basic getCudaDeviceData(m_a, a, getActualProblemSize()); \ deallocCudaDeviceData(a); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void initview1d_offset(Real_ptr a, Real_type v, const Index_type ibegin, const Index_type iend) { - Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT_VIEW1D_OFFSET_BODY; } } -void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) + +template < size_t block_size > +void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -62,7 +60,7 @@ void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - initview1d_offset<<>>( a, v, + initview1d_offset<<>>( a, v, ibegin, iend ); cudaErrchk( cudaGetLastError() ); @@ -80,7 +78,7 @@ void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { INIT_VIEW1D_OFFSET_BODY; }); @@ -111,10 +109,12 @@ void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D_OFFSET : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D_OFFSET, Cuda) + } // end namespace 
basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index b2e24a703..ae98f56ab 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define INIT_VIEW1D_OFFSET_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, getActualProblemSize()); @@ -34,19 +28,23 @@ namespace basic getHipDeviceData(m_a, a, getActualProblemSize()); \ deallocHipDeviceData(a); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void initview1d_offset(Real_ptr a, Real_type v, const Index_type ibegin, const Index_type iend) { - Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT_VIEW1D_OFFSET_BODY; } } -void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) + +template < size_t block_size > +void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -62,7 +60,7 @@ void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - hipLaunchKernelGGL((initview1d_offset), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((initview1d_offset), dim3(grid_size), dim3(block_size), 0, 0, a, v, ibegin, iend ); hipErrchk( hipGetLastError() ); @@ -83,7 +81,7 @@ void 
INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, initview1d_offset_lambda); hipErrchk( hipGetLastError() ); @@ -112,10 +110,12 @@ void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP; } else { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D_OFFSET : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D_OFFSET, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp index feb271d31..23a1c4e6f 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid) +void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -87,12 +87,12 @@ void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp index e419e7fca..a3091a076 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -37,7 +37,7 @@ namespace basic deallocOpenMPDeviceData(a, did); -void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid) +void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -83,7 +83,7 @@ void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown OMP Targetvariant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D_OFFSET : Unknown OMP Targetvariant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp index 12297cdaf..8b4db722b 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid) +void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -85,7 +85,7 @@ void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 06519f61b..165cd5544 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -58,18 +58,18 @@ INIT_VIEW1D_OFFSET::~INIT_VIEW1D_OFFSET() { } -void INIT_VIEW1D_OFFSET::setUp(VariantID vid) +void INIT_VIEW1D_OFFSET::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); m_val = 0.00000123; } -void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid) +void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_a, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize()); } -void INIT_VIEW1D_OFFSET::tearDown(VariantID vid) +void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index 333139909..be597496d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -56,17 +56,27 @@ class INIT_VIEW1D_OFFSET : public KernelBase ~INIT_VIEW1D_OFFSET(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_a; Real_type m_val; }; diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index a208c3692..0f702b83f 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -33,6 +33,8 @@ namespace basic { deallocCudaDeviceData(B); \ deallocCudaDeviceData(C); +template < Index_type tile_size > + __launch_bounds__(tile_size*tile_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Real_ptr B) { @@ -41,35 +43,39 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Index_type bx = blockIdx.x; Index_type by = blockIdx.y; - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(tile_size) - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { + 
for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(tile_size) __syncthreads(); - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(tile_size) __syncthreads(); } - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(tile_size) } -void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { +template < size_t block_size > +void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) +{ + constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); const Index_type N = m_N; - dim3 block_size(TL_SZ, TL_SZ); - dim3 grid_size(RAJA_DIVIDE_CEILING_INT(N, block_size.x), - RAJA_DIVIDE_CEILING_INT(N, block_size.y)); + dim3 blockDim(tile_size, tile_size); + dim3 gridDim(RAJA_DIVIDE_CEILING_INT(N, blockDim.x), + RAJA_DIVIDE_CEILING_INT(N, blockDim.y)); - const Index_type Nx = grid_size.x; - const Index_type Ny = grid_size.y; + const Index_type Nx = gridDim.x; + const Index_type Ny = gridDim.y; MAT_MAT_SHARED_DATA_SETUP; @@ -80,7 +86,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - mat_mat_shared<<>>(N, C, A, B); + mat_mat_shared<<>>(N, C, A, B); cudaErrchk( cudaGetLastError() ); } @@ -95,60 +101,60 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - lambda_cuda<<>>([=] __device__() { + lambda_cuda<<>>([=] __device__() { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) auto inner_y_1 = [&](Index_type ty) { - auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 }; + auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_1(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + 
if (ty < tile_size) inner_y_1(ty); } - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; ++k) { auto inner_y_2 = [&](Index_type ty) { - auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 }; + auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_2(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_2(ty); } __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 }; + auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_3(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_3(ty); } @@ -156,18 +162,18 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4 }; + auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_4(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_4(ty); } }; // outer_x @@ -194,61 +200,51 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_CUDA; - using launch_policy = RAJA::expt::LaunchPolicy - >; + constexpr bool async = true; - using teams_x = RAJA::expt::LoopPolicy; + using launch_policy = RAJA::expt::LaunchPolicy>; - using teams_y = RAJA::expt::LoopPolicy; + using teams_x = RAJA::expt::LoopPolicy; - using threads_x = RAJA::expt::LoopPolicy; + using teams_y = RAJA::expt::LoopPolicy; - using threads_y = RAJA::expt::LoopPolicy; + using threads_x = RAJA::expt::LoopPolicy; + using threads_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; 
++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(Nx, Ny), - RAJA::expt::Threads(TL_SZ, TL_SZ)), + RAJA::expt::Threads(tile_size, tile_size)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), [&](Index_type by) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(tile_size) } ); // RAJA::expt::loop - } + } ); // RAJA::expt::loop - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, - RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, + RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(tile_size) } ); // RAJA::expt::loop } @@ -256,11 +252,11 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(tile_size) } ); // RAJA::expt::loop } @@ -270,11 +266,11 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { } // for (k) - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + 
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(tile_size) } ); // RAJA::expt::loop } @@ -294,11 +290,13 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { MAT_MAT_SHARED_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n MAT_MAT_SHARED : Unknown Cuda variant id = " << vid + getCout() << "\n MAT_MAT_SHARED : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_MAT_SHARED, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index d4ea505e5..ac1b4fb7f 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -33,6 +33,8 @@ namespace basic { deallocHipDeviceData(B); \ deallocHipDeviceData(C); +template < Index_type tile_size > + __launch_bounds__(tile_size*tile_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Real_ptr B) { @@ -41,35 +43,39 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Index_type bx = blockIdx.x; Index_type by = blockIdx.y; - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(tile_size) - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(tile_size) __syncthreads(); - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(tile_size) __syncthreads(); } - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(tile_size) } -void MAT_MAT_SHARED::runHipVariant(VariantID vid) { +template < size_t block_size > +void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) +{ + constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + static_assert(tile_size*tile_size 
== block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); const Index_type N = m_N; - dim3 block_size(TL_SZ, TL_SZ); - dim3 grid_size(RAJA_DIVIDE_CEILING_INT(N, block_size.x), - RAJA_DIVIDE_CEILING_INT(N, block_size.y)); + dim3 blockDim(tile_size, tile_size); + dim3 gridDim(RAJA_DIVIDE_CEILING_INT(N, blockDim.x), + RAJA_DIVIDE_CEILING_INT(N, blockDim.y)); - const Index_type Nx = grid_size.x; - const Index_type Ny = grid_size.y; + const Index_type Nx = gridDim.x; + const Index_type Ny = gridDim.y; MAT_MAT_SHARED_DATA_SETUP; @@ -80,7 +86,7 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((mat_mat_shared), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((mat_mat_shared), dim3(gridDim), dim3(blockDim), 0, 0, N, C, A, B); hipErrchk( hipGetLastError() ); @@ -100,57 +106,57 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) auto inner_y_1 = [&](Index_type ty) { - auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 }; + auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_1(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_1(ty); } - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; ++k) { auto inner_y_2 = [&](Index_type ty) { - auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 }; + auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_2(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_2(ty); } __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - 
auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 }; + auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_3(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_3(ty); } @@ -158,18 +164,18 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4 }; + auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_4(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_4(ty); } }; // outer_x @@ -186,8 +192,8 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { } }; - hipLaunchKernelGGL(lambda_hip, - grid_size, block_size, 0, 0, mat_mat_shared_lam); + hipLaunchKernelGGL((lambda_hip), + gridDim, blockDim, 0, 0, mat_mat_shared_lam); hipErrchk( hipGetLastError() ); } @@ -199,33 +205,24 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_HIP; - using launch_policy = RAJA::expt::LaunchPolicy - >; + constexpr bool async = true; - using teams_x = RAJA::expt::LoopPolicy; + using launch_policy = RAJA::expt::LaunchPolicy>; - using teams_y = RAJA::expt::LoopPolicy; + using teams_x = RAJA::expt::LoopPolicy; - using threads_x = RAJA::expt::LoopPolicy; + using teams_y = RAJA::expt::LoopPolicy; - using threads_y = RAJA::expt::LoopPolicy; + using threads_x = RAJA::expt::LoopPolicy; + + using threads_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(Nx, Ny), - RAJA::expt::Threads(TL_SZ, TL_SZ)), + RAJA::expt::Threads(tile_size, tile_size)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), @@ -233,25 +230,25 @@ 
void MAT_MAT_SHARED::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(tile_size) } ); // RAJA::expt::loop } ); // RAJA::expt::loop - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(tile_size) } ); // RAJA::expt::loop } @@ -259,25 +256,25 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3 - } + MAT_MAT_SHARED_BODY_3(tile_size) + } ); // RAJA::expt::loop } ); // RAJA::expt::loop ctx.teamSync(); - + } // for (k) - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(tile_size) } ); // RAJA::expt::loop } @@ -297,11 +294,13 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { 
MAT_MAT_SHARED_DATA_TEARDOWN_HIP; } else { - std::cout << "\n MAT_MAT_SHARED : Unknown Hip variant id = " << vid + getCout() << "\n MAT_MAT_SHARED : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_MAT_SHARED, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp index c120646f6..484550704 100644 --- a/src/basic/MAT_MAT_SHARED-OMP.cpp +++ b/src/basic/MAT_MAT_SHARED-OMP.cpp @@ -15,7 +15,7 @@ namespace rajaperf { namespace basic { -void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { +void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); @@ -39,11 +39,11 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { for (Index_type by = 0; by < Ny; ++by) { for (Index_type bx = 0; bx < Nx; ++bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(TL_SZ) for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(TL_SZ) } } @@ -52,21 +52,21 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(TL_SZ) } } for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(TL_SZ) } } } for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(TL_SZ) } } } @@ -85,10 +85,10 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(TL_SZ) auto inner_y_1 = [&](Index_type ty) { - auto inner_x_1 = [&](Index_type tx) { 
MAT_MAT_SHARED_BODY_1 }; + auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { if (tx < TL_SZ) @@ -104,7 +104,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k) { auto inner_y_2 = [&](Index_type ty) { - auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 }; + auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_2(tx); @@ -116,7 +116,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { } auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 }; + auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -129,7 +129,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4 }; + auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_4(tx); @@ -159,55 +159,35 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { case RAJA_OpenMP: { //Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using outer_y = RAJA::expt::LoopPolicy; + using outer_y = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { //Grid is empty as the host does not need a compute grid to be specified - RAJA::expt::launch(RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::launch(RAJA::expt::Grid(), [=] 
RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), [&](Index_type by) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(TL_SZ) RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(TL_SZ) } ); // RAJA::expt::loop } @@ -219,7 +199,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(TL_SZ) } ); // RAJA::expt::loop } @@ -231,7 +211,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(TL_SZ) } ); // RAJA::expt::loop } @@ -245,7 +225,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(TL_SZ) } ); // RAJA::expt::loop } @@ -253,25 +233,25 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { } // lambda (bx) ); // RAJA::expt::loop - } // lambda (by) + } // lambda (by) ); // RAJA::expt::loop } // outer lambda (ctx) - ); // RAJA::expt::launch + ); // RAJA::expt::launch - } // loop over kernel reps + } // loop over kernel reps stopTimer(); break; } default: { - std::cout << "\n MAT_MAT_SHARED : Unknown variant id = " << vid + getCout() << "\n MAT_MAT_SHARED : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp index 41925870d..6dac3ee94 100644 --- a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp +++ 
b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp @@ -20,14 +20,14 @@ namespace rajaperf { namespace basic { - void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid) { + void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); switch (vid) { default: { - std::cout << "\n MAT_MAT_SHARED : Unknown OpenMPTarget variant id = " << vid << std::endl; + getCout() << "\n MAT_MAT_SHARED : Unknown OpenMPTarget variant id = " << vid << std::endl; break; } } diff --git a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp index 00119d1b3..b412daa32 100644 --- a/src/basic/MAT_MAT_SHARED-Seq.cpp +++ b/src/basic/MAT_MAT_SHARED-Seq.cpp @@ -13,7 +13,7 @@ namespace rajaperf { namespace basic { -void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { +void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; @@ -34,11 +34,11 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { //Work around for when compiling with CLANG and HIP //See notes in MAT_MAT_SHARED.hpp - MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU + MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(TL_SZ) for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(TL_SZ) } } @@ -46,13 +46,13 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(TL_SZ) } } for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(TL_SZ) } } @@ -60,7 +60,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(TL_SZ) } } } @@ -82,10 +82,10 @@ void 
MAT_MAT_SHARED::runSeqVariant(VariantID vid) { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU + MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(TL_SZ) auto inner_y_1 = [&](Index_type ty) { - auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 }; + auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { if (tx < TL_SZ) @@ -101,7 +101,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k) { auto inner_y_2 = [&](Index_type ty) { - auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 }; + auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_2(tx); @@ -113,7 +113,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { } auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 }; + auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -126,7 +126,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4 }; + auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_4(tx); @@ -155,57 +155,36 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { case RAJA_Seq: { - //Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; - - using outer_x = RAJA::expt::LoopPolicy; - - using outer_y = RAJA::expt::LoopPolicy; - - using inner_x = RAJA::expt::LoopPolicy; - - using inner_y = RAJA::expt::LoopPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; + + using outer_x = RAJA::expt::LoopPolicy; + + using outer_y = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using 
inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { //Grid is empty as the host does not need a compute grid to be specified - RAJA::expt::launch(RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::launch(RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), [&](Index_type by) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(TL_SZ) RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(TL_SZ) } ); // RAJA::expt::loop } @@ -217,7 +196,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(TL_SZ) } ); // RAJA::expt::loop } @@ -229,9 +208,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(TL_SZ) } - ); // RAJA::expt::loop + ); // RAJA::expt::loop } ); // RAJA::expt::loop @@ -243,7 +222,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(TL_SZ) } ); // RAJA::expt::loop } @@ -265,7 +244,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { #endif // RUN_RAJA_SEQ default: { - std::cout << "\n MAT_MAT_SHARED : Unknown variant id = " << vid + getCout() << "\n MAT_MAT_SHARED : Unknown variant id = " << vid << std::endl; } } diff --git 
a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index dd21012e6..98cd878ce 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -18,9 +18,8 @@ namespace rajaperf { namespace basic { MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) - : KernelBase(rajaperf::Basic_MAT_MAT_SHARED, params) + : KernelBase(rajaperf::Basic_MAT_MAT_SHARED, params) { - m_N_default = 1000; setDefaultProblemSize(m_N_default*m_N_default); setDefaultReps(5); @@ -65,7 +64,7 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) MAT_MAT_SHARED::~MAT_MAT_SHARED() {} -void MAT_MAT_SHARED::setUp(VariantID vid) { +void MAT_MAT_SHARED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type NN = m_N * m_N; allocAndInitDataConst(m_A, NN, 1.0, vid); @@ -73,11 +72,11 @@ void MAT_MAT_SHARED::setUp(VariantID vid) { allocAndInitDataConst(m_C, NN, 0.0, vid); } -void MAT_MAT_SHARED::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor ); +void MAT_MAT_SHARED::updateChecksum(VariantID vid, size_t tune_idx) { + checksum[vid][tune_idx] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor ); } -void MAT_MAT_SHARED::tearDown(VariantID vid) { +void MAT_MAT_SHARED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void)vid; deallocData(m_A); deallocData(m_B); diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 95b799eb8..095721c27 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -84,56 +84,41 @@ constexpr rajaperf::Index_type TL_SZ = 16; functions. Nvcc doesn't look at host only code when it does the device pass so it doesn't see these kind of problems. 
*/ -#define MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU \ - double As[TL_SZ][TL_SZ]; \ - double Bs[TL_SZ][TL_SZ]; \ - double Cs[TL_SZ][TL_SZ]; - -#define MAT_MAT_SHARED_BODY_0 \ - RAJA_TEAM_SHARED double As[TL_SZ][TL_SZ]; \ - RAJA_TEAM_SHARED double Bs[TL_SZ][TL_SZ]; \ - RAJA_TEAM_SHARED double Cs[TL_SZ][TL_SZ]; - -#define MAT_MAT_SHARED_BODY_1 Cs[ty][tx] = 0; - -#define MAT_MAT_SHARED_BODY_2 \ - const Index_type Row = by * TL_SZ + ty; \ - const Index_type Col = bx * TL_SZ + tx; \ - if (k * TL_SZ + tx < N && Row < N) \ - As[ty][tx] = A[Row * N + k * TL_SZ + tx]; \ +#define MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(tile_size) \ + double As[tile_size][tile_size]; \ + double Bs[tile_size][tile_size]; \ + double Cs[tile_size][tile_size]; + +#define MAT_MAT_SHARED_BODY_0(tile_size) \ + RAJA_TEAM_SHARED double As[tile_size][tile_size]; \ + RAJA_TEAM_SHARED double Bs[tile_size][tile_size]; \ + RAJA_TEAM_SHARED double Cs[tile_size][tile_size]; + +#define MAT_MAT_SHARED_BODY_1(tile_size) \ + Cs[ty][tx] = 0; + +#define MAT_MAT_SHARED_BODY_2(tile_size) \ + const Index_type Row = by * tile_size + ty; \ + const Index_type Col = bx * tile_size + tx; \ + if (k * tile_size + tx < N && Row < N) \ + As[ty][tx] = A[Row * N + k * tile_size + tx]; \ else \ As[ty][tx] = 0.0; \ - if (k * TL_SZ + ty < N && Col < N) \ - Bs[ty][tx] = B[(k * TL_SZ + ty) * N + Col]; \ + if (k * tile_size + ty < N && Col < N) \ + Bs[ty][tx] = B[(k * tile_size + ty) * N + Col]; \ else \ Bs[ty][tx] = 0.0; -#define MAT_MAT_SHARED_BODY_3 \ - for (Index_type n = 0; n < TL_SZ; ++n) \ +#define MAT_MAT_SHARED_BODY_3(tile_size) \ + for (Index_type n = 0; n < tile_size; ++n) \ Cs[ty][tx] += As[ty][n] * Bs[n][tx]; -#define MAT_MAT_SHARED_BODY_4 \ - const Index_type Row = by * TL_SZ + ty; \ - const Index_type Col = bx * TL_SZ + tx; \ +#define MAT_MAT_SHARED_BODY_4(tile_size) \ + const Index_type Row = by * tile_size + ty; \ + const Index_type Col = bx * tile_size + tx; \ if (Row < N && Col < N) \ C[Col + N * Row] = Cs[ty][tx]; -#if 
defined(RAJA_ENABLE_CUDA) - using mms_device_launch = RAJA::expt::cuda_launch_t; - using mms_gpu_block_x_policy = RAJA::cuda_block_x_direct; - using mms_gpu_block_y_policy = RAJA::cuda_block_y_direct; - using mms_gpu_thread_x_policy = RAJA::cuda_thread_x_direct; - using mms_gpu_thread_y_policy = RAJA::cuda_thread_y_direct; -#endif - -#if defined(RAJA_ENABLE_HIP) - using mms_device_launch = RAJA::expt::hip_launch_t; - using mms_gpu_block_x_policy = RAJA::hip_block_x_direct; - using mms_gpu_block_y_policy = RAJA::hip_block_y_direct; - using mms_gpu_thread_x_policy = RAJA::hip_thread_x_direct; - using mms_gpu_thread_y_policy = RAJA::hip_thread_y_direct; -#endif - namespace rajaperf { class RunParams; @@ -145,17 +130,27 @@ class MAT_MAT_SHARED : public KernelBase { ~MAT_MAT_SHARED(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = TL_SZ * TL_SZ; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_A; Real_ptr m_B; Real_ptr m_C; diff --git a/src/basic/MULADDSUB-Cuda.cpp 
b/src/basic/MULADDSUB-Cuda.cpp index 106d11865..3d8254c07 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define MULADDSUB_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(out1, m_out1, iend); \ allocAndInitCudaDeviceData(out2, m_out2, iend); \ @@ -44,18 +38,22 @@ namespace basic deallocCudaDeviceData(in1); \ deallocCudaDeviceData(in2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, Real_ptr in1, Real_ptr in2, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { MULADDSUB_BODY; } } -void MULADDSUB::runCudaVariant(VariantID vid) + +template < size_t block_size > +void MULADDSUB::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -71,7 +69,7 @@ void MULADDSUB::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - muladdsub<<>>( out1, out2, out3, in1, in2, + muladdsub<<>>( out1, out2, out3, in1, in2, iend ); cudaErrchk( cudaGetLastError() ); @@ -88,7 +86,7 @@ void MULADDSUB::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { 
MULADDSUB_BODY; }); @@ -117,10 +115,12 @@ void MULADDSUB::runCudaVariant(VariantID vid) MULADDSUB_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n MULADDSUB : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MULADDSUB, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 729c6cee3..cb9076b38 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define MULADDSUB_DATA_SETUP_HIP \ allocAndInitHipDeviceData(out1, m_out1, iend); \ allocAndInitHipDeviceData(out2, m_out2, iend); \ @@ -44,18 +38,22 @@ namespace basic deallocHipDeviceData(in1); \ deallocHipDeviceData(in2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, Real_ptr in1, Real_ptr in2, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { MULADDSUB_BODY; } } -void MULADDSUB::runHipVariant(VariantID vid) + +template < size_t block_size > +void MULADDSUB::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -71,7 +69,7 @@ void MULADDSUB::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = 
RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((muladdsub), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((muladdsub), dim3(grid_size), dim3(block_size), 0, 0, out1, out2, out3, in1, in2, iend ); hipErrchk( hipGetLastError() ); @@ -92,7 +90,7 @@ void MULADDSUB::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, muladdsub_lambda ); hipErrchk( hipGetLastError() ); @@ -119,10 +117,12 @@ void MULADDSUB::runHipVariant(VariantID vid) MULADDSUB_DATA_TEARDOWN_HIP; } else { - std::cout << "\n MULADDSUB : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MULADDSUB, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB-OMP.cpp b/src/basic/MULADDSUB-OMP.cpp index 1794a11d7..1204e9018 100644 --- a/src/basic/MULADDSUB-OMP.cpp +++ b/src/basic/MULADDSUB-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void MULADDSUB::runOpenMPVariant(VariantID vid) +void MULADDSUB::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void MULADDSUB::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index 064628d61..2048284b5 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -47,7 +47,7 @@ namespace basic deallocOpenMPDeviceData(in2, did); -void MULADDSUB::runOpenMPTargetVariant(VariantID vid) +void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -91,7 +91,7 @@ void MULADDSUB::runOpenMPTargetVariant(VariantID vid) MULADDSUB_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n MULADDSUB : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index b4651c55f..e93da7871 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void MULADDSUB::runSeqVariant(VariantID vid) +void MULADDSUB::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -79,7 +79,7 @@ void MULADDSUB::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index baa201dc1..d1c180b8e 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -57,7 +57,7 @@ MULADDSUB::~MULADDSUB() { } -void MULADDSUB::setUp(VariantID vid) +void MULADDSUB::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_out1, getActualProblemSize(), 0.0, vid); allocAndInitDataConst(m_out2, getActualProblemSize(), 0.0, vid); @@ -66,14 +66,14 @@ void MULADDSUB::setUp(VariantID vid) allocAndInitData(m_in2, getActualProblemSize(), vid); } -void MULADDSUB::updateChecksum(VariantID vid) +void MULADDSUB::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_out1, getActualProblemSize()); - checksum[vid] += calcChecksum(m_out2, getActualProblemSize()); - checksum[vid] += calcChecksum(m_out3, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out1, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out2, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize()); } -void MULADDSUB::tearDown(VariantID vid) +void MULADDSUB::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_out1); diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index afb0a5f38..30ad11a54 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -49,17 +49,27 @@ class MULADDSUB : public KernelBase ~MULADDSUB(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_out1; Real_ptr m_out2; Real_ptr m_out3; diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 306a9a67a..7528c5cec 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,14 +22,18 @@ namespace basic { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - constexpr size_t i_block_sz = 32; - constexpr size_t j_block_sz = 8; - constexpr size_t k_block_sz = 1; +#define i_block_sz (32) +#define j_block_sz (block_size / i_block_sz) +#define k_block_sz (1) + +#define NESTED_INIT_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + i_block_sz, j_block_sz, k_block_sz #define NESTED_INIT_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + dim3 nthreads_per_block(NESTED_INIT_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); \ + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define NESTED_INIT_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(ni, i_block_sz)), \ @@ -44,11 +48,13 @@ namespace basic getCudaDeviceData(m_array, array, m_array_length); \ deallocCudaDeviceData(array); +template< size_t i_block_size, size_t j_block_size, size_t k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) __global__ void nested_init(Real_ptr array, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; + Index_type i = blockIdx.x * i_block_size + threadIdx.x; + Index_type j = blockIdx.y * j_block_size + threadIdx.y; Index_type k = blockIdx.z; if ( i < ni && j < nj && k < nk ) { @@ -56,21 +62,24 @@ __global__ void nested_init(Real_ptr array, } } -template< typename Lambda > +template< size_t i_block_size, size_t j_block_size, size_t k_block_size, typename Lambda > +__launch_bounds__(i_block_size*j_block_size*k_block_size) __global__ void nested_init_lam(Index_type ni, Index_type nj, Index_type nk, Lambda body) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; +{ + Index_type i = blockIdx.x * i_block_size + threadIdx.x; + Index_type j = 
blockIdx.y * j_block_size + threadIdx.y; Index_type k = blockIdx.z; - + if ( i < ni && j < nj && k < nk ) { body(i, j, k); } } -void NESTED_INIT::runCudaVariant(VariantID vid) + +template < size_t block_size > +void NESTED_INIT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -85,8 +94,9 @@ void NESTED_INIT::runCudaVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_CUDA; NESTED_INIT_NBLOCKS_CUDA; - - nested_init<<>>(array, + + nested_init + <<>>(array, ni, nj, nk); cudaErrchk( cudaGetLastError() ); @@ -105,7 +115,8 @@ void NESTED_INIT::runCudaVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_CUDA; NESTED_INIT_NBLOCKS_CUDA; - nested_init_lam<<>>(ni, nj, nk, + nested_init_lam + <<>>(ni, nj, nk, [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; } @@ -124,13 +135,13 @@ void NESTED_INIT::runCudaVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernelFixedAsync, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i RAJA::statement::Lambda<0> > > @@ -157,10 +168,12 @@ void NESTED_INIT::runCudaVariant(VariantID vid) NESTED_INIT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n NESTED_INIT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n NESTED_INIT : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NESTED_INIT, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index 4038a47a2..49c050f6f 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ 
b/src/basic/NESTED_INIT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,14 +22,18 @@ namespace basic { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - constexpr size_t i_block_sz = 32; - constexpr size_t j_block_sz = 8; - constexpr size_t k_block_sz = 1; +#define i_block_sz (32) +#define j_block_sz (block_size / i_block_sz) +#define k_block_sz (1) + +#define NESTED_INIT_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + i_block_sz, j_block_sz, k_block_sz #define NESTED_INIT_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + dim3 nthreads_per_block(NESTED_INIT_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); \ + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define NESTED_INIT_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(ni, i_block_sz)), \ @@ -44,11 +48,13 @@ namespace basic getHipDeviceData(m_array, array, m_array_length); \ deallocHipDeviceData(array); +template< size_t i_block_size, size_t j_block_size, size_t k_block_size > + __launch_bounds__(i_block_size*j_block_size*k_block_size) __global__ void nested_init(Real_ptr array, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; + Index_type i = blockIdx.x * i_block_size + threadIdx.x; + Index_type j = blockIdx.y * j_block_size + threadIdx.y; Index_type k = blockIdx.z; if ( i < ni && j < nj && k < nk ) { @@ -56,12 +62,13 @@ __global__ void nested_init(Real_ptr array, } } -template +template< size_t i_block_size, size_t j_block_size, size_t k_block_size, typename Lambda > 
+__launch_bounds__(i_block_size*j_block_size*k_block_size) __global__ void nested_init_lam(Index_type ni, Index_type nj, Index_type nk, Lambda body) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; + Index_type i = blockIdx.x * i_block_size + threadIdx.x; + Index_type j = blockIdx.y * j_block_size + threadIdx.y; Index_type k = blockIdx.z; if ( i < ni && j < nj && k < nk ) { @@ -70,7 +77,9 @@ __global__ void nested_init_lam(Index_type ni, Index_type nj, Index_type nk, } -void NESTED_INIT::runHipVariant(VariantID vid) + +template < size_t block_size > +void NESTED_INIT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -86,8 +95,8 @@ void NESTED_INIT::runHipVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_HIP; NESTED_INIT_NBLOCKS_HIP; - hipLaunchKernelGGL((nested_init), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((nested_init), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, array, ni, nj, nk); hipErrchk( hipGetLastError() ); @@ -106,12 +115,12 @@ void NESTED_INIT::runHipVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_HIP; NESTED_INIT_NBLOCKS_HIP; - auto nested_init_lambda = [=] __device__ (Index_type i, Index_type j, + auto nested_init_lambda = [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }; - hipLaunchKernelGGL((nested_init_lam< decltype(nested_init_lambda) >), + hipLaunchKernelGGL((nested_init_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, ni, nj, nk, nested_init_lambda); hipErrchk( hipGetLastError() ); @@ -142,7 +151,7 @@ void NESTED_INIT::runHipVariant(VariantID vid) > > > - >; + >; startTimer(); @@ -161,10 +170,12 @@ void NESTED_INIT::runHipVariant(VariantID vid) NESTED_INIT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n NESTED_INIT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n NESTED_INIT : Unknown Hip variant id = " << vid << std::endl; } } 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NESTED_INIT, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT-OMP.cpp b/src/basic/NESTED_INIT-OMP.cpp index b714712d5..4471740df 100644 --- a/src/basic/NESTED_INIT-OMP.cpp +++ b/src/basic/NESTED_INIT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -21,10 +21,10 @@ namespace basic #undef USE_OMP_COLLAPSE -void NESTED_INIT::runOpenMPVariant(VariantID vid) +void NESTED_INIT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - + const Index_type run_reps = getRunReps(); NESTED_INIT_DATA_SETUP; @@ -94,7 +94,7 @@ void NESTED_INIT::runOpenMPVariant(VariantID vid) > >; #else - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k RAJA::statement::For<1, RAJA::loop_exec, // j @@ -122,12 +122,12 @@ void NESTED_INIT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + getCout() << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp index 435df40c1..2c0b2389f 100644 --- a/src/basic/NESTED_INIT-OMPTarget.cpp +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore 
National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -32,7 +32,7 @@ namespace basic deallocOpenMPDeviceData(array, did); -void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) +void NESTED_INIT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -46,14 +46,14 @@ void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(array) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(3) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(3) for (Index_type k = 0; k < nk; ++k ) { for (Index_type j = 0; j < nj; ++j ) { for (Index_type i = 0; i < ni; ++i ) { NESTED_INIT_BODY; } } - } + } } stopTimer(); @@ -64,7 +64,7 @@ void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) NESTED_INIT_DATA_SETUP_OMP_TARGET; - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i @@ -87,8 +87,8 @@ void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) NESTED_INIT_DATA_TEARDOWN_OMP_TARGET; - } else { - std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + } else { + getCout() << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/NESTED_INIT-Seq.cpp b/src/basic/NESTED_INIT-Seq.cpp index f79cc9603..48da1b37a 100644 --- a/src/basic/NESTED_INIT-Seq.cpp +++ b/src/basic/NESTED_INIT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void NESTED_INIT::runSeqVariant(VariantID vid) +void NESTED_INIT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -71,7 +71,7 @@ void NESTED_INIT::runSeqVariant(VariantID vid) case RAJA_Seq : { - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::loop_exec, // k RAJA::statement::For<1, RAJA::loop_exec, // j @@ -99,7 +99,7 @@ void NESTED_INIT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + getCout() << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 77d847691..ef9550d97 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -68,17 +68,17 @@ NESTED_INIT::~NESTED_INIT() { } -void NESTED_INIT::setUp(VariantID vid) +void NESTED_INIT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_array, m_array_length, 0.0, vid); } -void NESTED_INIT::updateChecksum(VariantID vid) +void NESTED_INIT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_array, m_array_length); + checksum[vid][tune_idx] += calcChecksum(m_array, m_array_length); } -void NESTED_INIT::tearDown(VariantID vid) +void NESTED_INIT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; RAJA::free_aligned(m_array); diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 508ba8030..13da52cf2 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -49,17 +49,28 @@ class NESTED_INIT : public KernelBase ~NESTED_INIT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + Index_type m_array_length; Real_ptr m_array; diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 57522fed3..6f28f8c2a 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,23 +21,19 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define PI_ATOMIC_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(pi, m_pi, 1); #define PI_ATOMIC_DATA_TEARDOWN_CUDA \ deallocCudaDeviceData(pi); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pi_atomic(Real_ptr pi, Real_type dx, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); @@ -45,7 +41,9 @@ __global__ void pi_atomic(Real_ptr pi, } -void PI_ATOMIC::runCudaVariant(VariantID vid) + +template < size_t block_size > +void PI_ATOMIC::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -63,7 +61,7 @@ void PI_ATOMIC::runCudaVariant(VariantID vid) initCudaDeviceData(pi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - pi_atomic<<>>( pi, dx, iend ); + pi_atomic<<>>( pi, dx, iend ); cudaErrchk( cudaGetLastError() ); getCudaDeviceData(m_pi, pi, 1); @@ -84,7 +82,7 @@ void PI_ATOMIC::runCudaVariant(VariantID vid) initCudaDeviceData(pi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); @@ -123,10 +121,12 @@ void PI_ATOMIC::runCudaVariant(VariantID vid) PI_ATOMIC_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n PI_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PI_ATOMIC, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC-Hip.cpp 
b/src/basic/PI_ATOMIC-Hip.cpp index 0910a4198..605696676 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,23 +21,19 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define PI_ATOMIC_DATA_SETUP_HIP \ allocAndInitHipDeviceData(pi, m_pi, 1); #define PI_ATOMIC_DATA_TEARDOWN_HIP \ deallocHipDeviceData(pi); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void atomic_pi(Real_ptr pi, Real_type dx, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); @@ -45,7 +41,9 @@ __global__ void atomic_pi(Real_ptr pi, } -void PI_ATOMIC::runHipVariant(VariantID vid) + +template < size_t block_size > +void PI_ATOMIC::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -63,7 +61,7 @@ void PI_ATOMIC::runHipVariant(VariantID vid) initHipDeviceData(pi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(atomic_pi,grid_size, block_size, 0, 0, pi, dx, iend ); + hipLaunchKernelGGL((atomic_pi),grid_size, block_size, 0, 0, pi, dx, iend ); hipErrchk( hipGetLastError() ); getHipDeviceData(m_pi, pi, 1); @@ -89,7 +87,7 @@ void PI_ATOMIC::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, 
iend, atomic_pi_lambda); hipErrchk( hipGetLastError() ); @@ -125,10 +123,12 @@ void PI_ATOMIC::runHipVariant(VariantID vid) PI_ATOMIC_DATA_TEARDOWN_HIP; } else { - std::cout << "\n PI_ATOMIC : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PI_ATOMIC, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index b30352de7..4296ed845 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void PI_ATOMIC::runOpenMPVariant(VariantID vid) +void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -40,7 +40,7 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid) for (Index_type i = ibegin; i < iend; ++i ) { double x = (double(i) + 0.5) * dx; #pragma omp atomic - *pi += dx / (1.0 + x * x); + *pi += dx / (1.0 + x * x); } *pi *= 4.0; @@ -80,7 +80,7 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { *pi = m_pi_init; - RAJA::forall( + RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); @@ -94,12 +94,12 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n PI_ATOMIC : Unknown variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown variant id = " << vid << 
std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index 08cc41167..2a059f99b 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -36,7 +36,7 @@ namespace basic deallocOpenMPDeviceData(pi, did); -void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid) +void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -52,7 +52,7 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); - + #pragma omp target is_device_ptr(pi) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { @@ -84,16 +84,16 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - getOpenMPDeviceData(m_pi, pi, 1, hid, did); + getOpenMPDeviceData(m_pi, pi, 1, hid, did); *m_pi *= 4.0; } stopTimer(); PI_ATOMIC_DATA_TEARDOWN_OMP_TARGET; - + } else { - std::cout << "\n PI_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index 941062fed..486201caa 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void PI_ATOMIC::runSeqVariant(VariantID vid) +void PI_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -73,9 +73,9 @@ void PI_ATOMIC::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + *pi = m_pi_init; - RAJA::forall( RAJA::RangeSegment(ibegin, iend), + RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); @@ -90,7 +90,7 @@ void PI_ATOMIC::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n PI_ATOMIC : Unknown variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 94e29c8ae..776883232 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -59,19 +59,19 @@ PI_ATOMIC::~PI_ATOMIC() { } -void PI_ATOMIC::setUp(VariantID vid) +void PI_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_dx = 1.0 / double(getActualProblemSize()); allocAndInitDataConst(m_pi, 1, 0.0, vid); m_pi_init = 0.0; } -void PI_ATOMIC::updateChecksum(VariantID vid) +void PI_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += Checksum_type(*m_pi); + checksum[vid][tune_idx] += Checksum_type(*m_pi); } -void PI_ATOMIC::tearDown(VariantID vid) +void PI_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_pi); diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index 9c71d2d70..10c674dda 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -45,17 +45,27 @@ class PI_ATOMIC : public KernelBase ~PI_ATOMIC(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + Real_type m_dx; Real_ptr m_pi; Real_type m_pi_init; diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 0c9d38c13..80c8fd3b4 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,28 +21,24 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, Real_ptr dpi, Real_type pi_init, Index_type iend) { extern __shared__ Real_type ppi[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; ppi[ threadIdx.x ] = pi_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { double x = (double(i) + 0.5) * dx; - ppi[ threadIdx.x ] += dx / (1.0 + x * x); + ppi[ threadIdx.x ] += dx / (1.0 + x * x); } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { ppi[ threadIdx.x ] += ppi[ threadIdx.x + i ]; } @@ -57,11 +53,13 @@ __global__ void pi_reduce(Real_type dx, if ( threadIdx.x == 0 ) { *dpi += ppi[ 0 ]; } -#endif +#endif } -void PI_REDUCE::runCudaVariant(VariantID vid) + +template < size_t block_size > +void PI_REDUCE::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -80,9 +78,9 @@ void PI_REDUCE::runCudaVariant(VariantID vid) initCudaDeviceData(dpi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - pi_reduce<<>>( dx, - dpi, m_pi_init, + pi_reduce<<>>( dx, + dpi, m_pi_init, iend ); cudaErrchk( cudaGetLastError() ); @@ -115,10 +113,12 @@ void PI_REDUCE::runCudaVariant(VariantID vid) stopTimer(); } else { - std::cout << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PI_REDUCE, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 41a0a8ae9..bb34ed37e 
100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,28 +21,24 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, Real_ptr dpi, Real_type pi_init, Index_type iend) { HIP_DYNAMIC_SHARED(Real_type, ppi); - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; ppi[ threadIdx.x ] = pi_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { double x = (double(i) + 0.5) * dx; - ppi[ threadIdx.x ] += dx / (1.0 + x * x); + ppi[ threadIdx.x ] += dx / (1.0 + x * x); } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { ppi[ threadIdx.x ] += ppi[ threadIdx.x + i ]; } @@ -57,11 +53,13 @@ __global__ void pi_reduce(Real_type dx, if ( threadIdx.x == 0 ) i{ *dpi += ppi[ 0 ]; } -#endif +#endif } -void PI_REDUCE::runHipVariant(VariantID vid) + +template < size_t block_size > +void PI_REDUCE::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -80,7 +78,7 @@ void PI_REDUCE::runHipVariant(VariantID vid) initHipDeviceData(dpi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), + hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, dx, dpi, m_pi_init, 
iend ); hipErrchk( hipGetLastError() ); @@ -114,10 +112,12 @@ void PI_REDUCE::runHipVariant(VariantID vid) stopTimer(); } else { - std::cout << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PI_REDUCE, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index 3261b8e61..bc03012c3 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ b/src/basic/PI_REDUCE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void PI_REDUCE::runOpenMPVariant(VariantID vid) +void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -36,7 +36,7 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Real_type pi = m_pi_init; - + #pragma omp parallel for reduction(+:pi) for (Index_type i = ibegin; i < iend; ++i ) { PI_REDUCE_BODY; @@ -79,11 +79,11 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { PI_REDUCE_BODY; }); @@ -97,12 +97,12 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n PI_REDUCE : Unknown variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : 
Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index 60eaa4a84..a942839b4 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void PI_REDUCE::runOpenMPTargetVariant(VariantID vid) +void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -43,8 +43,8 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Real_type pi = m_pi_init; - - #pragma omp target device( did ) map(tofrom:pi) + + #pragma omp target device( did ) map(tofrom:pi) #pragma omp teams distribute parallel for reduction(+:pi) \ thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { @@ -60,11 +60,11 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); + + RAJA::ReduceSum pi(m_pi_init); RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { PI_REDUCE_BODY; }); @@ -75,7 +75,7 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid) stopTimer(); } else { - std::cout << "\n PI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : Unknown OMP Target variant 
id = " << vid << std::endl; } } diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 70ee92f79..6d6f885fe 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { -void PI_REDUCE::runSeqVariant(VariantID vid) +void PI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -76,10 +76,10 @@ void PI_REDUCE::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); - RAJA::forall( RAJA::RangeSegment(ibegin, iend), + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { PI_REDUCE_BODY; }); @@ -94,7 +94,7 @@ void PI_REDUCE::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n PI_REDUCE : Unknown variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index b7032e61d..16d0770ba 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -57,7 +57,7 @@ PI_REDUCE::~PI_REDUCE() { } -void PI_REDUCE::setUp(VariantID vid) +void PI_REDUCE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; m_dx = 1.0 / double(getActualProblemSize()); @@ -65,12 +65,12 @@ void PI_REDUCE::setUp(VariantID vid) m_pi = 0.0; } -void PI_REDUCE::updateChecksum(VariantID vid) +void PI_REDUCE::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += Checksum_type(m_pi); + checksum[vid][tune_idx] += Checksum_type(m_pi); } -void PI_REDUCE::tearDown(VariantID vid) +void PI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; } diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 59ea5321a..c7cc3258a 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -47,17 +47,27 @@ class PI_REDUCE : public KernelBase ~PI_REDUCE(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_type m_dx; Real_type m_pi; Real_type m_pi_init; diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 50481f5b2..0e7c645e7 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define REDUCE3_INT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(vec, m_vec, iend); @@ -34,6 +28,8 @@ namespace basic deallocCudaDeviceData(vec); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void reduce3int(Int_ptr vec, Int_ptr vsum, Int_type vsum_init, Int_ptr vmin, Int_type vmin_init, @@ -41,23 +37,23 @@ __global__ void reduce3int(Int_ptr vec, Index_type iend) { extern __shared__ Int_type psum[ ]; - Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; - Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; + Int_type* pmin = (Int_type*)&psum[ 1 * block_size ]; + Int_type* pmax = (Int_type*)&psum[ 2 * block_size ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; psum[ threadIdx.x ] = vsum_init; pmin[ threadIdx.x ] = vmin_init; pmax[ threadIdx.x ] = vmax_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { psum[ threadIdx.x ] += vec[ i ]; pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); @@ -82,7 +78,9 @@ __global__ void reduce3int(Int_ptr vec, } -void REDUCE3_INT::runCudaVariant(VariantID vid) + +template < size_t block_size > +void REDUCE3_INT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -110,7 +108,7 @@ void REDUCE3_INT::runCudaVariant(VariantID vid) cudaMemcpyHostToDevice ) ); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - 
reduce3int<<<<>>(vec, vmem + 0, m_vsum_init, vmem + 1, m_vmin_init, @@ -159,10 +157,12 @@ void REDUCE3_INT::runCudaVariant(VariantID vid) REDUCE3_INT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE3_INT, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index ba13fa8af..8e92cb123 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define REDUCE3_INT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(vec, m_vec, iend); @@ -34,6 +28,8 @@ namespace basic deallocHipDeviceData(vec); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void reduce3int(Int_ptr vec, Int_ptr vsum, Int_type vsum_init, Int_ptr vmin, Int_type vmin_init, @@ -41,23 +37,23 @@ __global__ void reduce3int(Int_ptr vec, Index_type iend) { HIP_DYNAMIC_SHARED( Int_type, psum) - Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; - Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; + Int_type* pmin = (Int_type*)&psum[ 1 * block_size ]; + Int_type* pmax = (Int_type*)&psum[ 2 * block_size ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; psum[ threadIdx.x ] = vsum_init; pmin[ threadIdx.x ] = vmin_init; pmax[ threadIdx.x ] = vmax_init; 
- for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { psum[ threadIdx.x ] += vec[ i ]; pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); @@ -82,7 +78,9 @@ __global__ void reduce3int(Int_ptr vec, } -void REDUCE3_INT::runHipVariant(VariantID vid) + +template < size_t block_size > +void REDUCE3_INT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -110,7 +108,7 @@ void REDUCE3_INT::runHipVariant(VariantID vid) hipMemcpyHostToDevice ) ); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), 3*sizeof(Int_type)*block_size, 0, + hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), 3*sizeof(Int_type)*block_size, 0, vec, vmem + 0, m_vsum_init, vmem + 1, m_vmin_init, @@ -159,10 +157,12 @@ void REDUCE3_INT::runHipVariant(VariantID vid) REDUCE3_INT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE3_INT, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index f0e853f0a..0f759180b 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, 
Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -13,13 +13,13 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace basic { -void REDUCE3_INT::runOpenMPVariant(VariantID vid) +void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -82,7 +82,7 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid) m_vsum += vsum; m_vmin = RAJA_MIN(m_vmin, vmin); m_vmax = RAJA_MAX(m_vmax, vmax); - + } stopTimer(); @@ -93,7 +93,7 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + RAJA::ReduceSum vsum(m_vsum_init); RAJA::ReduceMin vmin(m_vmin_init); RAJA::ReduceMax vmax(m_vmax_init); @@ -114,12 +114,12 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index ef11b6f5d..7db4bbdd6 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -36,7 +36,7 @@ namespace basic deallocOpenMPDeviceData(vec, did); \ -void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid) +void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -100,7 +100,7 @@ void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid) REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index 04760c9a5..a3e42cf14 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace basic { -void REDUCE3_INT::runSeqVariant(VariantID vid) +void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -55,9 +55,9 @@ void REDUCE3_INT::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto init3_base_lam = [=](Index_type i) -> Int_type { - return vec[i]; - }; + auto reduce3_base_lam = [=](Index_type i) -> Int_type { + return vec[i]; + }; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -67,9 +67,9 @@ void REDUCE3_INT::runSeqVariant(VariantID vid) Int_type vmax = m_vmax_init; for (Index_type i = ibegin; i < iend; ++i ) { - vsum += init3_base_lam(i); - vmin = RAJA_MIN(vmin, init3_base_lam(i)); - vmax = RAJA_MAX(vmax, init3_base_lam(i)); + vsum += reduce3_base_lam(i); + vmin = RAJA_MIN(vmin, reduce3_base_lam(i)); + vmax = RAJA_MAX(vmax, reduce3_base_lam(i)); } m_vsum += vsum; @@ -108,7 +108,7 @@ void REDUCE3_INT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 821a4b7e3..dee6d3a5e 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -62,7 +62,7 @@ REDUCE3_INT::~REDUCE3_INT() { } -void REDUCE3_INT::setUp(VariantID vid) +void REDUCE3_INT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_vec, getActualProblemSize(), vid); @@ -74,14 +74,14 @@ void REDUCE3_INT::setUp(VariantID vid) m_vmax_init = std::numeric_limits::min(); } -void REDUCE3_INT::updateChecksum(VariantID vid) +void REDUCE3_INT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += m_vsum; - checksum[vid] += m_vmin; - checksum[vid] += m_vmax; + checksum[vid][tune_idx] += m_vsum; + checksum[vid][tune_idx] += m_vmin; + checksum[vid][tune_idx] += m_vmax; } -void REDUCE3_INT::tearDown(VariantID vid) +void REDUCE3_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_vec); diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index b3acc5004..93ad766c2 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -61,17 +61,27 @@ class REDUCE3_INT : public KernelBase ~REDUCE3_INT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Int_ptr m_vec; Int_type m_vsum; Int_type m_vsum_init; diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp new file mode 100644 index 000000000..52bec116d --- /dev/null +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -0,0 +1,199 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +#define REDUCE_STRUCT_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(points.x, m_x, points.N); \ + allocAndInitCudaDeviceData(points.y, m_y, points.N); \ + + +#define REDUCE_STRUCT_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(points.x); \ + deallocCudaDeviceData(points.y); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void reduce_struct(Real_ptr x, Real_ptr y, + Real_ptr xsum, Real_ptr xmin, Real_ptr xmax, + Real_ptr ysum, Real_ptr ymin, Real_ptr ymax, + Real_type m_init_sum, + Real_type m_init_min, + Real_type m_init_max, + Index_type iend) +{ + + //x + extern __shared__ Real_type shared[]; + Real_type* pxsum = (Real_type*)&shared[ 0 * blockDim.x ]; + Real_type* pxmin = (Real_type*)&shared[ 1 * blockDim.x ]; + Real_type* pxmax = (Real_type*)&shared[ 2 * blockDim.x ]; + //y + Real_type* pysum = (Real_type*)&shared[ 3 * blockDim.x ]; + Real_type* pymin = (Real_type*)&shared[ 4 * blockDim.x ]; + Real_type* pymax = (Real_type*)&shared[ 5 * blockDim.x ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + //x + pxsum[ threadIdx.x ] = m_init_sum; + pxmin[ threadIdx.x ] = m_init_min; + pxmax[ threadIdx.x ] = m_init_max; + //y + pysum[ threadIdx.x ] = m_init_sum; + pymin[ threadIdx.x ] = m_init_min; + pymax[ threadIdx.x ] = m_init_max; + + + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + //x + pxsum[ threadIdx.x ] += x[ i ]; + pxmin[ threadIdx.x ] = RAJA_MIN( pxmin[ threadIdx.x ], x[ i ] ); + pxmax[ threadIdx.x ] = RAJA_MAX( pxmax[ threadIdx.x ], x[ i ] ); + //y + pysum[ threadIdx.x ] += y[ i ]; + pymin[ threadIdx.x ] = RAJA_MIN( pymin[ threadIdx.x ], y[ i ] ); + pymax[ threadIdx.x ] = RAJA_MAX( pymax[ 
threadIdx.x ], y[ i ] ); + + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + //x + pxsum[ threadIdx.x ] += pxsum[ threadIdx.x + i ]; + pxmin[ threadIdx.x ] = RAJA_MIN( pxmin[ threadIdx.x ], pxmin[ threadIdx.x + i ] ); + pxmax[ threadIdx.x ] = RAJA_MAX( pxmax[ threadIdx.x ], pxmax[ threadIdx.x + i ] ); + //y + pysum[ threadIdx.x ] += pysum[ threadIdx.x + i ]; + pymin[ threadIdx.x ] = RAJA_MIN( pymin[ threadIdx.x ], pymin[ threadIdx.x + i ] ); + pymax[ threadIdx.x ] = RAJA_MAX( pymax[ threadIdx.x ], pymax[ threadIdx.x + i ] ); + + } + __syncthreads(); + } + +// serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( xsum, pxsum[ 0 ] ); + RAJA::atomicMin( xmin, pxmin[ 0 ] ); + RAJA::atomicMax( xmax, pxmax[ 0 ] ); + + RAJA::atomicAdd( xsum, pysum[ 0 ] ); + RAJA::atomicMin( ymin, pymin[ 0 ] ); + RAJA::atomicMax( ymax, pymax[ 0 ] ); + } +} + +template < size_t block_size > +void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + REDUCE_STRUCT_DATA_SETUP_CUDA; + + Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax + allocCudaDeviceData(mem,6); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + + reduce_struct<<>>( + points.x, points.y, + mem, mem+1, mem+2, // xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max, + points.N); + cudaErrchk( cudaGetLastError() ); + + Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + Real_ptr plmem = &lmem[0]; + getCudaDeviceData(plmem, mem, 6); + + points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); + points.SetXMin(lmem[1]); + points.SetXMax(lmem[2]); + 
points.SetYMin(lmem[4]); + points.SetYMax(lmem[5]); + m_points=points; + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_CUDA; + + deallocCudaDeviceData(mem); + + } else if ( vid == RAJA_CUDA ) { + + REDUCE_STRUCT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points=points; + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_CUDA; + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } + +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE_STRUCT, Cuda) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp new file mode 100644 index 000000000..f72306107 --- /dev/null +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -0,0 +1,201 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +#define REDUCE_STRUCT_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(points.x, m_x, points.N); \ + allocAndInitHipDeviceData(points.y, m_y, points.N); \ + +#define REDUCE_STRUCT_DATA_TEARDOWN_HIP \ + deallocHipDeviceData(points.x); \ + deallocHipDeviceData(points.y); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void reduce_struct(Real_ptr x, Real_ptr y, + Real_ptr xsum, Real_ptr xmin, Real_ptr xmax, + Real_ptr ysum, Real_ptr ymin, Real_ptr ymax, + Real_type m_init_sum, + Real_type m_init_min, + Real_type m_init_max, + Index_type iend) +{ + + //x + HIP_DYNAMIC_SHARED( Real_type, shared) + Real_type* pxsum = (Real_type*)&shared[ 0 * blockDim.x ]; + Real_type* pxmin = (Real_type*)&shared[ 1 * blockDim.x ]; + Real_type* pxmax = (Real_type*)&shared[ 2 * blockDim.x ]; + //y + Real_type* pysum = (Real_type*)&shared[ 3 * blockDim.x ]; + Real_type* pymin = (Real_type*)&shared[ 4 * blockDim.x ]; + Real_type* pymax = (Real_type*)&shared[ 5 * blockDim.x ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + //x + pxsum[ threadIdx.x ] = m_init_sum; + pxmin[ threadIdx.x ] = m_init_min; + pxmax[ threadIdx.x ] = m_init_max; + //y + pysum[ threadIdx.x ] = m_init_sum; + pymin[ threadIdx.x ] = m_init_min; + pymax[ threadIdx.x ] = m_init_max; + + + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + //x + pxsum[ threadIdx.x ] += x[ i ]; + pxmin[ threadIdx.x ] = RAJA_MIN( pxmin[ threadIdx.x ], x[ i ] ); + pxmax[ threadIdx.x ] = RAJA_MAX( pxmax[ threadIdx.x ], x[ i ] ); + //y + pysum[ threadIdx.x ] += y[ i ]; + pymin[ threadIdx.x ] = RAJA_MIN( pymin[ threadIdx.x ], y[ i ] ); + pymax[ threadIdx.x ] = RAJA_MAX( pymax[ threadIdx.x ], 
y[ i ] ); + + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + //x + pxsum[ threadIdx.x ] += pxsum[ threadIdx.x + i ]; + pxmin[ threadIdx.x ] = RAJA_MIN( pxmin[ threadIdx.x ], pxmin[ threadIdx.x + i ] ); + pxmax[ threadIdx.x ] = RAJA_MAX( pxmax[ threadIdx.x ], pxmax[ threadIdx.x + i ] ); + //y + pysum[ threadIdx.x ] += pysum[ threadIdx.x + i ]; + pymin[ threadIdx.x ] = RAJA_MIN( pymin[ threadIdx.x ], pymin[ threadIdx.x + i ] ); + pymax[ threadIdx.x ] = RAJA_MAX( pymax[ threadIdx.x ], pymax[ threadIdx.x + i ] ); + + } + __syncthreads(); + } + +// serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( xsum, pxsum[ 0 ] ); + RAJA::atomicMin( xmin, pxmin[ 0 ] ); + RAJA::atomicMax( xmax, pxmax[ 0 ] ); + + RAJA::atomicAdd( ysum, pysum[ 0 ] ); + RAJA::atomicMin( ymin, pymin[ 0 ] ); + RAJA::atomicMax( ymax, pymax[ 0 ] ); + } +} + + +template < size_t block_size > +void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == Base_HIP ) { + + REDUCE_STRUCT_DATA_SETUP_HIP; + + Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax + allocHipDeviceData(mem,6); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + + hipLaunchKernelGGL((reduce_struct), + dim3(grid_size), dim3(block_size), + 6*sizeof(Real_type)*block_size, 0, + points.x, points.y, + mem, mem+1, mem+2, // xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max, + points.N); + hipErrchk( hipGetLastError() ); + + Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + Real_ptr plmem = &lmem[0]; + getHipDeviceData(plmem, mem, 6); + + points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); + 
points.SetXMin(lmem[1]); + points.SetXMax(lmem[2]); + points.SetYMin(lmem[4]); + points.SetYMax(lmem[5]); + m_points=points; + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_HIP; + + deallocHipDeviceData(mem); + + } else if ( vid == RAJA_HIP ) { + + REDUCE_STRUCT_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points=points; + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; + } + +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE_STRUCT, Hip) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp new file mode 100644 index 000000000..ec60e6919 --- /dev/null +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -0,0 +1,154 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace basic +{ + + +void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_STRUCT_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; + + #pragma omp parallel for reduction(+:xsum), \ + reduction(min:xmin), \ + reduction(max:xmax), \ + reduction(+:ysum), \ + reduction(min:ymin), \ + reduction(max:ymax) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_STRUCT_BODY; + } + + points.SetCenter(xsum/points.N, ysum/points.N); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); + m_points=points; + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto reduce_struct_x_base_lam = [=](Index_type i) -> Real_type { + return points.x[i]; + }; + + auto reduce_struct_y_base_lam = [=](Index_type i) -> Real_type { + return points.y[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; + + #pragma omp parallel for reduction(+:xsum), \ + reduction(min:xmin), \ + reduction(max:xmax), \ + reduction(+:ysum), \ + reduction(min:ymin), \ + reduction(max:ymax) + for (Index_type i = ibegin; i < 
iend; ++i ) { + xsum += reduce_struct_x_base_lam(i); + xmin = RAJA_MIN(xmin, reduce_struct_x_base_lam(i)); + xmax = RAJA_MAX(xmax, reduce_struct_x_base_lam(i)); + ysum += reduce_struct_y_base_lam(i); + ymin = RAJA_MIN(ymin, reduce_struct_y_base_lam(i)); + ymax = RAJA_MAX(ymax, reduce_struct_y_base_lam(i)); + } + + points.SetCenter(xsum/points.N, ysum/points.N); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); + m_points=points; + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points=points; + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n REDUCE_STRUCT : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp new file mode 100644 index 000000000..baa2b67d2 --- /dev/null +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -0,0 +1,126 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "REDUCE_STRUCT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_TARGET_OPENMP)
+
+#include "common/OpenMPTargetDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace basic
+{
+
+  //
+  // Define threads per team for target execution
+  //
+  const size_t threads_per_team = 256;
+
+// Use the OpenMP-target data utilities (this TU includes
+// OpenMPTargetDataUtils.hpp, not the Hip ones that were pasted here).
+#define REDUCE_STRUCT_DATA_SETUP_OMP_TARGET \
+  int hid = omp_get_initial_device(); \
+  int did = omp_get_default_device(); \
+\
+  allocAndInitOpenMPDeviceData(points.x, m_x, points.N, did, hid); \
+  allocAndInitOpenMPDeviceData(points.y, m_y, points.N, did, hid);
+
+#define REDUCE_STRUCT_DATA_TEARDOWN_OMP_TARGET \
+  deallocOpenMPDeviceData(points.x, did); \
+  deallocOpenMPDeviceData(points.y, did); \
+
+
+void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  REDUCE_STRUCT_DATA_SETUP;
+
+  if ( vid == Base_OpenMPTarget ) {
+
+    REDUCE_STRUCT_DATA_SETUP_OMP_TARGET;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type xsum = m_init_sum; Real_type ysum = m_init_sum;
+      Real_type xmin = m_init_min; Real_type ymin = m_init_min;
+      Real_type xmax = m_init_max; Real_type ymax = m_init_max;
+
+      // 'vec' was undefined here; the device pointers are points.x/points.y
+      #pragma omp target is_device_ptr(points.x, points.y) device( did ) map(tofrom:xsum, xmin, xmax, ysum, ymin, ymax)
+      #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static,1) \
+                               reduction(+:xsum) \
+                               reduction(min:xmin) \
+                               reduction(max:xmax), \
+                               reduction(+:ysum), \
+                               reduction(min:ymin), \
+                               reduction(max:ymax)
+      for (Index_type i = ibegin; i < iend; ++i ) {
+        REDUCE_STRUCT_BODY;
+      }
+
+      points.SetCenter(xsum/points.N, ysum/points.N);
+      points.SetXMin(xmin);
+      points.SetXMax(xmax);
+      points.SetYMin(ymin);
+      points.SetYMax(ymax);
+      m_points=points;
+
+    }
+    stopTimer();
+
+ 
REDUCE_STRUCT_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + REDUCE_STRUCT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); + m_points=points; + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n REDUCE_STRUCT : Unknown OMP Target variant id = " << vid << std::endl; + } + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp new file mode 100644 index 000000000..71fe7a471 --- /dev/null +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -0,0 +1,139 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace basic +{ + + +void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_STRUCT_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; + + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_STRUCT_BODY; + } + + points.SetCenter(xsum/(points.N), ysum/(points.N)); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); + m_points=points; + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto reduce_struct_x_base_lam = [=](Index_type i) -> Real_type { + return points.x[i]; + }; + + auto reduce_struct_y_base_lam = [=](Index_type i) -> Real_type { + return points.y[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; + + for (Index_type i = ibegin; i < iend; ++i ) { + xsum += reduce_struct_x_base_lam(i); + xmin = RAJA_MIN(xmin, reduce_struct_x_base_lam(i)); + xmax = RAJA_MAX(xmax, reduce_struct_x_base_lam(i)); + ysum += reduce_struct_y_base_lam(i); + ymin = RAJA_MIN(ymin, reduce_struct_y_base_lam(i)); + ymax = RAJA_MAX(ymax, reduce_struct_y_base_lam(i)); + } + + points.SetCenter(xsum/(points.N), ysum/(points.N)); + 
points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); + m_points=points; + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); + m_points=points; + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n REDUCE_STRUCT : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp new file mode 100644 index 000000000..d5c33f906 --- /dev/null +++ b/src/basic/REDUCE_STRUCT.cpp @@ -0,0 +1,100 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) + : KernelBase(rajaperf::Basic_REDUCE_STRUCT, params) +{ + setDefaultProblemSize(1000000); +//setDefaultReps(5000); +// Set reps to low value until we resolve RAJA omp-target +// reduction performance issues + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( 6*sizeof(Real_type) + 2*sizeof(Real_type)*getActualProblemSize()); + setFLOPsPerRep(2 * getActualProblemSize() + 2); + + + setUsesFeature(Forall); + setUsesFeature(Reduction); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +REDUCE_STRUCT::~REDUCE_STRUCT() +{ +} + +void REDUCE_STRUCT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + m_init_sum = 0.0; + m_init_min = std::numeric_limits::max(); + m_init_max = std::numeric_limits::lowest(); + allocAndInitData(m_x, getActualProblemSize(), vid); + allocAndInitData(m_y, getActualProblemSize(), vid); + Real_type dx = Lx/(Real_type)(getActualProblemSize()); + Real_type dy = Ly/(Real_type)(getActualProblemSize()); + for (int i=0;i + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + + struct points{ + Int_type N; + Real_ptr x, y; + + Real_ptr GetCenter(){return 
&center[0];};
+      Real_type GetXMax(){return xmax;};
+      Real_type GetXMin(){return xmin;};
+      Real_type GetYMax(){return ymax;};
+      Real_type GetYMin(){return ymin;};
+      void SetCenter(Real_type xval, Real_type yval){this->center[0]=xval, this->center[1]=yval;};
+      void SetXMin(Real_type val){this->xmin=val;};
+      void SetXMax(Real_type val){this->xmax=val;};
+      void SetYMin(Real_type val){this->ymin=val;};
+      void SetYMax(Real_type val){this->ymax=val;};
+
+      //results
+      private:
+      Real_type center[2] = {0.0,0.0};
+      Real_type xmin, xmax;
+      Real_type ymin, ymax;
+  };
+private:
+  static const size_t default_gpu_block_size = 256;
+  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  Real_ptr m_x; Real_ptr m_y;
+  Real_type m_init_sum;
+  Real_type m_init_min;
+  Real_type m_init_max;
+  points m_points;
+  Real_type X_MIN = 0.0, X_MAX = 100.0;
+  Real_type Y_MIN = 0.0, Y_MAX = 50.0;
+  Real_type Lx = (X_MAX) - (X_MIN);
+  Real_type Ly = (Y_MAX) - (Y_MIN);
+
+};
+
+} // end namespace basic
+} // end namespace rajaperf
+
+#endif // closing endif for header file include guard
diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp
index 305104c4a..d2845cbfd 100644
--- a/src/basic/TRAP_INT-Cuda.cpp
+++ b/src/basic/TRAP_INT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -37,17 +37,13 @@ Real_type trap_int_func(Real_type x,
 }
 
 
- //
- // Define thread block size for CUDA execution
- //
- const size_t block_size = 256;
-
-
 #define TRAP_INT_DATA_SETUP_CUDA // nothing to do here...
 
 #define TRAP_INT_DATA_TEARDOWN_CUDA // nothing to do here...
 
+template < size_t block_size > +__launch_bounds__(block_size) __global__ void trapint(Real_type x0, Real_type xp, Real_type y, Real_type yp, Real_type h, @@ -56,17 +52,17 @@ __global__ void trapint(Real_type x0, Real_type xp, { extern __shared__ Real_type psumx[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; psumx[ threadIdx.x ] = 0.0; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Real_type x = x0 + i*h; Real_type val = trap_int_func(x, y, xp, yp); psumx[ threadIdx.x ] += val; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; } @@ -86,7 +82,9 @@ __global__ void trapint(Real_type x0, Real_type xp, } -void TRAP_INT::runCudaVariant(VariantID vid) + +template < size_t block_size > +void TRAP_INT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -107,7 +105,7 @@ void TRAP_INT::runCudaVariant(VariantID vid) initCudaDeviceData(sumx, &m_sumx_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - trapint<<<<>>(x0, xp, y, yp, h, @@ -149,10 +147,12 @@ void TRAP_INT::runCudaVariant(VariantID vid) TRAP_INT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRAP_INT, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 40e6158bb..63101962f 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National 
Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -37,17 +37,13 @@ Real_type trap_int_func(Real_type x, } - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define TRAP_INT_DATA_SETUP_HIP // nothing to do here... #define TRAP_INT_DATA_TEARDOWN_HIP // nothing to do here... +template < size_t block_size > +__launch_bounds__(block_size) __global__ void trapint(Real_type x0, Real_type xp, Real_type y, Real_type yp, Real_type h, @@ -56,17 +52,17 @@ __global__ void trapint(Real_type x0, Real_type xp, { HIP_DYNAMIC_SHARED( Real_type, psumx) - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; psumx[ threadIdx.x ] = 0.0; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Real_type x = x0 + i*h; Real_type val = trap_int_func(x, y, xp, yp); psumx[ threadIdx.x ] += val; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; } @@ -86,7 +82,9 @@ __global__ void trapint(Real_type x0, Real_type xp, } -void TRAP_INT::runHipVariant(VariantID vid) + +template < size_t block_size > +void TRAP_INT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -107,7 +105,7 @@ void TRAP_INT::runHipVariant(VariantID vid) initHipDeviceData(sumx, &m_sumx_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, x0, xp, + hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, x0, xp, y, yp, h, sumx, @@ -148,10 +146,12 @@ void TRAP_INT::runHipVariant(VariantID vid) 
TRAP_INT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRAP_INT, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index cfc449c77..bd4c3c24b 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -32,7 +32,7 @@ Real_type trap_int_func(Real_type x, } -void TRAP_INT::runOpenMPVariant(VariantID vid) +void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -110,12 +110,12 @@ void TRAP_INT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index 636f4090a..53dab376a 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -46,7 +46,7 @@ Real_type trap_int_func(Real_type x, #define TRAP_INT_DATA_TEARDOWN_OMP_TARGET // nothing to do here... -void TRAP_INT::runOpenMPTargetVariant(VariantID vid) +void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -66,8 +66,8 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid) Real_type sumx = m_sumx_init; #pragma omp target teams distribute parallel for map(tofrom: sumx) reduction(+:sumx) \ - thread_limit(threads_per_team) schedule(static, 1) - + thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { TRAP_INT_BODY; } @@ -77,7 +77,7 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - #pragma omp target exit data map(delete: x0,xp,y,yp,h) + #pragma omp target exit data map(delete: x0,xp,y,yp,h) } else if ( vid == RAJA_OpenMPTarget ) { @@ -101,7 +101,7 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid) TRAP_INT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; } } diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index a1d657392..310d5e9ef 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -32,7 +32,7 @@ Real_type trap_int_func(Real_type x, } -void TRAP_INT::runSeqVariant(VariantID vid) +void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -108,7 +108,7 @@ void TRAP_INT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index e7483d9f4..3bf939f38 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -57,7 +57,7 @@ TRAP_INT::~TRAP_INT() { } -void TRAP_INT::setUp(VariantID vid) +void TRAP_INT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { Real_type xn; initData(xn, vid); @@ -74,12 +74,12 @@ void TRAP_INT::setUp(VariantID vid) m_sumx = 0; } -void TRAP_INT::updateChecksum(VariantID vid) +void TRAP_INT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += m_sumx; + checksum[vid][tune_idx] += m_sumx; } -void TRAP_INT::tearDown(VariantID vid) +void TRAP_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; } diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 171d72418..50acfeb79 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -58,17 +58,27 @@ class TRAP_INT : public KernelBase ~TRAP_INT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_type m_x0; Real_type m_xp; Real_type m_y; diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index a673d2e43..0e459fa62 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 0467d0f19..5010f982e 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -18,6 +18,7 @@ #if defined(RAJA_ENABLE_CUDA) +#include "common/GPUUtils.hpp" #include "RAJA/policy/cuda/raja_cudaerrchk.hpp" @@ -25,6 +26,19 @@ namespace rajaperf { +/*! + * \brief Device timer, returns a time in ns from an arbitrary starting point. + * Note that this time is consistent across the whole device. + */ +__device__ __forceinline__ unsigned long long device_timer() +{ + unsigned long long global_timer = 0; +#if __CUDA_ARCH__ >= 300 + asm volatile ("mov.u64 %0, %globaltimer;" : "=l"(global_timer)); +#endif + return global_timer; +} + /*! * \brief Simple forall cuda kernel that runs a lambda. */ @@ -36,6 +50,16 @@ __global__ void lambda_cuda_forall(Index_type ibegin, Index_type iend, Lambda bo body(i); } } +/// +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void lambda_cuda_forall(Index_type ibegin, Index_type iend, Lambda body) +{ + Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + body(i); + } +} /*! * \brief Simple cuda kernel that runs a lambda. @@ -43,7 +67,14 @@ __global__ void lambda_cuda_forall(Index_type ibegin, Index_type iend, Lambda bo template < typename Lambda > __global__ void lambda_cuda(Lambda body) { - body(); + body(); +} +/// +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void lambda_cuda(Lambda body) +{ + body(); } /*! 
diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 41b089fa3..6856d1f6c 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -40,8 +40,7 @@ void incDataInitCount() */ void allocAndInitData(Int_ptr& ptr, int len, VariantID vid) { - // Should we do this differently for alignment?? If so, change dealloc() - ptr = new Int_type[len]; + allocData(ptr, len); initData(ptr, len, vid); } @@ -50,44 +49,56 @@ void allocAndInitData(Int_ptr& ptr, int len, VariantID vid) */ void allocAndInitData(Real_ptr& ptr, int len, VariantID vid ) { - ptr = - RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Real_type)); + allocData(ptr, len); initData(ptr, len, vid); } void allocAndInitDataConst(Real_ptr& ptr, int len, Real_type val, VariantID vid) { - (void) vid; - - ptr = - RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Real_type)); + allocData(ptr, len); initDataConst(ptr, len, val, vid); } void allocAndInitDataRandSign(Real_ptr& ptr, int len, VariantID vid) { - ptr = - RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Real_type)); + allocData(ptr, len); initDataRandSign(ptr, len, vid); } void allocAndInitDataRandValue(Real_ptr& ptr, int len, VariantID vid) +{ + allocData(ptr, len); + initDataRandValue(ptr, len, vid); +} + +void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) +{ + allocData(ptr, len); + initData(ptr, len, vid); +} + + +/* + * Allocate data arrays of given type. + */ +void allocData(Int_ptr& ptr, int len) +{ + // Should we do this differently for alignment?? 
If so, change dealloc() + ptr = new Int_type[len]; +} + +void allocData(Real_ptr& ptr, int len) { ptr = RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, len*sizeof(Real_type)); - initDataRandValue(ptr, len, vid); } -void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) +void allocData(Complex_ptr& ptr, int len) { // Should we do this differently for alignment?? If so, change dealloc() ptr = new Complex_type[len]; - initData(ptr, len, vid); } @@ -95,7 +106,7 @@ void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) * Free data arrays of given type. */ void deallocData(Int_ptr& ptr) -{ +{ if (ptr) { delete [] ptr; ptr = 0; @@ -103,7 +114,7 @@ void deallocData(Int_ptr& ptr) } void deallocData(Real_ptr& ptr) -{ +{ if (ptr) { RAJA::free_aligned(ptr); ptr = 0; @@ -112,7 +123,7 @@ void deallocData(Real_ptr& ptr) void deallocData(Complex_ptr& ptr) { - if (ptr) { + if (ptr) { delete [] ptr; ptr = 0; } @@ -120,7 +131,7 @@ void deallocData(Complex_ptr& ptr) /* - * \brief Initialize Int_type data array to + * \brief Initialize Int_type data array to * randomly signed positive and negative values. */ void initData(Int_ptr& ptr, int len, VariantID vid) @@ -148,11 +159,11 @@ void initData(Int_ptr& ptr, int len, VariantID vid) ptr[i] = ( signfact < 0.5 ? -1 : 1 ); }; - signfact = Real_type(rand())/RAND_MAX; + signfact = Real_type(rand())/RAND_MAX; Int_type ilo = len * signfact; ptr[ilo] = -58; - signfact = Real_type(rand())/RAND_MAX; + signfact = Real_type(rand())/RAND_MAX; Int_type ihi = len * signfact; ptr[ihi] = 19; @@ -160,11 +171,11 @@ void initData(Int_ptr& ptr, int len, VariantID vid) } /* - * Initialize Real_type data array to non-random - * positive values (0.0, 1.0) based on their array position + * Initialize Real_type data array to non-random + * positive values (0.0, 1.0) based on their array position * (index) and the order in which this method is called. 
*/ -void initData(Real_ptr& ptr, int len, VariantID vid) +void initData(Real_ptr& ptr, int len, VariantID vid) { (void) vid; @@ -172,19 +183,19 @@ void initData(Real_ptr& ptr, int len, VariantID vid) // first touch... #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || + if ( vid == Base_OpenMP || vid == Lambda_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); }; - } + } #endif for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); - } + } incDataInitCount(); } @@ -193,7 +204,7 @@ void initData(Real_ptr& ptr, int len, VariantID vid) * Initialize Real_type data array to constant values. */ void initDataConst(Real_ptr& ptr, int len, Real_type val, - VariantID vid) + VariantID vid) { // first touch... @@ -289,10 +300,10 @@ void initData(Complex_ptr& ptr, int len, VariantID vid) #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || + vid == Lambda_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); }; } @@ -322,18 +333,47 @@ void initData(Real_type& d, VariantID vid) /* * Calculate and return checksum for data arrays. 
*/ -long double calcChecksum(const Real_ptr ptr, int len, +long double calcChecksum(const Int_ptr ptr, int len, + Real_type scale_factor) +{ + long double tchk = 0.0; + long double ckahan = 0.0; + for (Index_type j = 0; j < len; ++j) { + long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; + long double y = x - ckahan; + volatile long double t = tchk + y; + volatile long double z = t - tchk; + ckahan = z - y; + tchk = t; +#if 0 // RDH DEBUG + if ( (j % 100) == 0 ) { + getCout() << "j : tchk = " << j << " : " << tchk << std::endl; + } +#endif + } + tchk *= scale_factor; + return tchk; +} + +long double calcChecksum(const Real_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; + long double ckahan = 0.0; for (Index_type j = 0; j < len; ++j) { - tchk += (j+1)*ptr[j]*scale_factor; + long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; + long double y = x - ckahan; + volatile long double t = tchk + y; + volatile long double z = t - tchk; + ckahan = z - y; + tchk = t; #if 0 // RDH DEBUG if ( (j % 100) == 0 ) { - std::cout << "j : tchk = " << j << " : " << tchk << std::endl; + getCout() << "j : tchk = " << j << " : " << tchk << std::endl; } #endif } + tchk *= scale_factor; return tchk; } @@ -341,14 +381,21 @@ long double calcChecksum(const Complex_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; + long double ckahan = 0.0; for (Index_type j = 0; j < len; ++j) { - tchk += (j+1)*(real(ptr[j])+imag(ptr[j]))*scale_factor; + long double x = (std::abs(std::sin(j+1.0))+0.5) * (real(ptr[j])+imag(ptr[j])); + long double y = x - ckahan; + volatile long double t = tchk + y; + volatile long double z = t - tchk; + ckahan = z - y; + tchk = t; #if 0 // RDH DEBUG if ( (j % 100) == 0 ) { - std::cout << "j : tchk = " << j << " : " << tchk << std::endl; + getCout() << "j : tchk = " << j << " : " << tchk << std::endl; } #endif } + tchk *= scale_factor; return tchk; } diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 
7708eeb08..887f54f82 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -31,7 +31,7 @@ namespace rajaperf { - + /*! * Reset counter for data initialization. */ @@ -45,7 +45,7 @@ void incDataInitCount(); /*! * \brief Allocate and initialize Int_type data array. - * + * * Array is initialized using method initData(Int_ptr& ptr...) below. */ void allocAndInitData(Int_ptr& ptr, int len, @@ -61,8 +61,8 @@ void allocAndInitData(Real_ptr& ptr, int len, /*! * \brief Allocate and initialize aligned Real_type data array. - * - * Array entries are initialized using the method + * + * Array entries are initialized using the method * initDataConst(Real_ptr& ptr...) below. */ void allocAndInitDataConst(Real_ptr& ptr, int len, Real_type val, @@ -91,6 +91,14 @@ void allocAndInitDataRandValue(Real_ptr& ptr, int len, void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid = NumVariants); +/*! + * \brief Allocate data arrays. + */ +void allocData(Int_ptr& ptr, int len); +/// +void allocData(Real_ptr& ptr, int len); +/// +void allocData(Complex_ptr& ptr, int len); /*! * \brief Free data arrays. @@ -104,9 +112,9 @@ void deallocData(Complex_ptr& ptr); /*! * \brief Initialize Int_type data array. - * + * * Array entries are randomly initialized to +/-1. - * Then, two randomly-chosen entries are reset, one to + * Then, two randomly-chosen entries are reset, one to * a value > 1, one to a value < -1. */ void initData(Int_ptr& ptr, int len, @@ -132,8 +140,8 @@ void initDataConst(Real_ptr& ptr, int len, Real_type val, /*! * \brief Initialize Real_type data array with random sign. 
- * - * Array entries are initialized in the same way as the method + * + * Array entries are initialized in the same way as the method * initData(Real_ptr& ptr...) above, but with random sign. */ void initDataRandSign(Real_ptr& ptr, int len, @@ -150,7 +158,7 @@ void initDataRandValue(Real_ptr& ptr, int len, /*! * \brief Initialize Complex_type data array. * - * Real and imaginary array entries are initialized in the same way as the + * Real and imaginary array entries are initialized in the same way as the * method allocAndInitData(Real_ptr& ptr...) above. */ void initData(Complex_ptr& ptr, int len, @@ -159,7 +167,7 @@ void initData(Complex_ptr& ptr, int len, /*! * \brief Initialize Real_type scalar data. * - * Data is set similarly to an array enttry in the method + * Data is set similarly to an array enttry in the method * initData(Real_ptr& ptr...) above. */ void initData(Real_type& d, @@ -167,13 +175,16 @@ void initData(Real_type& d, /*! * \brief Calculate and return checksum for data arrays. - * + * * Checksums are computed as a weighted sum of array entries, * where weight is a simple function of elemtn index. * * Checksumn is multiplied by given scale factor. */ -long double calcChecksum(Real_ptr d, int len, +long double calcChecksum(Int_ptr d, int len, + Real_type scale_factor = 1.0); +/// +long double calcChecksum(Real_ptr d, int len, Real_type scale_factor = 1.0); /// long double calcChecksum(Complex_ptr d, int len, diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 17b772e3d..7b5db6887 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,14 +12,21 @@ #include "common/KernelBase.hpp" #include "common/OutputUtils.hpp" +#ifdef RAJA_PERFSUITE_ENABLE_MPI +#include +#endif + // Warmup kernels to run first to help reduce startup overheads in timings #include "basic/DAXPY.hpp" #include "basic/REDUCE3_INT.hpp" +#include "basic/INDEXLIST_3LOOP.hpp" #include "algorithm/SORT.hpp" +#include "apps/HALOEXCHANGE_FUSED.hpp" #include #include #include +#include #include #include @@ -36,7 +43,8 @@ using namespace std; Executor::Executor(int argc, char** argv) : run_params(argc, argv), - reference_vid(NumVariants) + reference_vid(NumVariants), + reference_tune_idx(KernelBase::getUnknownTuningIdx()) { } @@ -56,13 +64,169 @@ void Executor::setupSuite() return; } - cout << "\nSetting up suite based on input..." << endl; + getCout() << "\nSetting up suite based on input..." << endl; using Slist = list; using Svector = vector; + using COvector = vector; using KIDset = set; using VIDset = set; + // + // Determine which kernels to exclude from input. + // exclude_kern will be non-duplicated ordered set of IDs of kernel to exclude. 
+ // + const Svector& npasses_combiner_input = run_params.getNpassesCombinerOptInput(); + if ( !npasses_combiner_input.empty() ) { + + COvector combiners; + Svector invalid; + for (const std::string& combiner_name : npasses_combiner_input) { + + if (combiner_name == RunParams::CombinerOptToStr(RunParams::CombinerOpt::Average)) { + combiners.emplace_back(RunParams::CombinerOpt::Average); + } else if (combiner_name == RunParams::CombinerOptToStr(RunParams::CombinerOpt::Minimum)) { + combiners.emplace_back(RunParams::CombinerOpt::Minimum); + } else if (combiner_name == RunParams::CombinerOptToStr(RunParams::CombinerOpt::Maximum)) { + combiners.emplace_back(RunParams::CombinerOpt::Maximum); + } else { + invalid.emplace_back(combiner_name); + } + + } + + run_params.setNpassesCombinerOpts(combiners); + run_params.setInvalidNpassesCombinerOptInput(invalid); + + } + + // + // Determine which kernels to exclude from input. + // exclude_kern will be non-duplicated ordered set of IDs of kernel to exclude. + // + const Svector& exclude_kernel_input = run_params.getExcludeKernelInput(); + const Svector& exclude_feature_input = run_params.getExcludeFeatureInput(); + + KIDset exclude_kern; + + if ( !exclude_kernel_input.empty() ) { + + // Make list copy of exclude kernel name input to manipulate for + // processing potential group names and/or kernel names, next + Slist exclude_kern_names(exclude_kernel_input.begin(), exclude_kernel_input.end()); + + // + // Search exclude_kern_names for matching group names. + // groups2exclude will contain names of groups to exclude. 
+ // + Svector groups2exclude; + for (Slist::iterator it = exclude_kern_names.begin(); it != exclude_kern_names.end(); ++it) + { + for (size_t ig = 0; ig < NumGroups; ++ig) { + const string& group_name = getGroupName(static_cast(ig)); + if ( group_name == *it ) { + groups2exclude.push_back(group_name); + } + } + } + + // + // If group name(s) found in exclude_kern_names, assemble kernels in group(s) + // to run and remove those group name(s) from exclude_kern_names list. + // + for (size_t ig = 0; ig < groups2exclude.size(); ++ig) { + const string& gname(groups2exclude[ig]); + + for (size_t ik = 0; ik < NumKernels; ++ik) { + KernelID kid = static_cast(ik); + if ( getFullKernelName(kid).find(gname) != string::npos ) { + exclude_kern.insert(kid); + } + } + + exclude_kern_names.remove(gname); + } + + // + // Look for matching names of individual kernels in remaining exclude_kern_names. + // + // Assemble invalid input for warning message. + // + Svector invalid; + + for (Slist::iterator it = exclude_kern_names.begin(); it != exclude_kern_names.end(); ++it) + { + bool found_it = false; + + for (size_t ik = 0; ik < NumKernels && !found_it; ++ik) { + KernelID kid = static_cast(ik); + if ( getKernelName(kid) == *it || getFullKernelName(kid) == *it ) { + exclude_kern.insert(kid); + found_it = true; + } + } + + if ( !found_it ) invalid.push_back(*it); + } + + run_params.setInvalidExcludeKernelInput(invalid); + + } + + if ( !exclude_feature_input.empty() ) { + + // First, check for invalid exclude_feature input. + // Assemble invalid input for warning message. 
+ // + Svector invalid; + + for (size_t i = 0; i < exclude_feature_input.size(); ++i) { + bool found_it = false; + + for (size_t fid = 0; fid < NumFeatures && !found_it; ++fid) { + FeatureID tfid = static_cast(fid); + if ( getFeatureName(tfid) == exclude_feature_input[i] ) { + found_it = true; + } + } + + if ( !found_it ) invalid.push_back( exclude_feature_input[i] ); + } + run_params.setInvalidExcludeFeatureInput(invalid); + + // + // If feature input is valid, determine which kernels use + // input-specified features and add to set of kernels to run. + // + if ( run_params.getInvalidExcludeFeatureInput().empty() ) { + + for (size_t i = 0; i < exclude_feature_input.size(); ++i) { + + const string& feature = exclude_feature_input[i]; + + bool found_it = false; + for (size_t fid = 0; fid < NumFeatures && !found_it; ++fid) { + FeatureID tfid = static_cast(fid); + if ( getFeatureName(tfid) == feature ) { + found_it = true; + + for (int kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast(kid); + KernelBase* kern = getKernelObject(tkid, run_params); + if ( kern->usesFeature(tfid) ) { + exclude_kern.insert( tkid ); + } + delete kern; + } // loop over kernels + + } // if input feature name matches feature id + } // loop over feature ids until name match is found + + } // loop over feature name input + + } // if feature name input is valid + } + // // Determine which kernels to execute from input. // run_kern will be non-duplicated ordered set of IDs of kernel to run. @@ -75,10 +239,13 @@ void Executor::setupSuite() if ( kernel_input.empty() && feature_input.empty() ) { // - // No kernels or fatures specified in input, run them all... + // No kernels or features specified in input, run them all... 
// - for (size_t ik = 0; ik < NumKernels; ++ik) { - run_kern.insert( static_cast(ik) ); + for (size_t kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast(kid); + if (exclude_kern.find(tkid) == exclude_kern.end()) { + run_kern.insert( tkid ); + } } } else { @@ -130,7 +297,8 @@ void Executor::setupSuite() for (int kid = 0; kid < NumKernels; ++kid) { KernelID tkid = static_cast(kid); KernelBase* kern = getKernelObject(tkid, run_params); - if ( kern->usesFeature(tfid) ) { + if ( kern->usesFeature(tfid) && + exclude_kern.find(tkid) == exclude_kern.end() ) { run_kern.insert( tkid ); } delete kern; @@ -171,10 +339,11 @@ void Executor::setupSuite() for (size_t ig = 0; ig < groups2run.size(); ++ig) { const string& gname(groups2run[ig]); - for (size_t ik = 0; ik < NumKernels; ++ik) { - KernelID kid = static_cast(ik); - if ( getFullKernelName(kid).find(gname) != string::npos ) { - run_kern.insert(kid); + for (size_t kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast(kid); + if ( getFullKernelName(tkid).find(gname) != string::npos && + exclude_kern.find(tkid) == exclude_kern.end()) { + run_kern.insert(tkid); } } @@ -192,10 +361,12 @@ void Executor::setupSuite() { bool found_it = false; - for (size_t ik = 0; ik < NumKernels && !found_it; ++ik) { - KernelID kid = static_cast(ik); - if ( getKernelName(kid) == *it || getFullKernelName(kid) == *it ) { - run_kern.insert(kid); + for (size_t kid = 0; kid < NumKernels && !found_it; ++kid) { + KernelID tkid = static_cast(kid); + if ( getKernelName(tkid) == *it || getFullKernelName(tkid) == *it ) { + if (exclude_kern.find(tkid) == exclude_kern.end()) { + run_kern.insert(tkid); + } found_it = true; } } @@ -220,6 +391,44 @@ void Executor::setupSuite() } } + + // + // Determine variants to execute from input. + // run_var will be non-duplicated ordered set of IDs of variants to run. 
+ // + const Svector& exclude_variant_names = run_params.getExcludeVariantInput(); + + VIDset exclude_var; + + if ( !exclude_variant_names.empty() ) { + + // + // Parse input to determine which variants to exclude. + // + // Assemble invalid input for warning message. + // + + Svector invalid; + + for (size_t it = 0; it < exclude_variant_names.size(); ++it) { + bool found_it = false; + + for (VIDset::iterator vid_it = available_var.begin(); + vid_it != available_var.end(); ++vid_it) { + VariantID vid = *vid_it; + if ( getVariantName(vid) == exclude_variant_names[it] ) { + exclude_var.insert(vid); + found_it = true; + } + } + + if ( !found_it ) invalid.push_back(exclude_variant_names[it]); + } + + run_params.setInvalidExcludeVariantInput(invalid); + + } + // // Determine variants to execute from input. // run_var will be non-duplicated ordered set of IDs of variants to run. @@ -237,9 +446,12 @@ void Executor::setupSuite() for (VIDset::iterator vid_it = available_var.begin(); vid_it != available_var.end(); ++vid_it) { VariantID vid = *vid_it; - run_var.insert( vid ); - if ( getVariantName(vid) == run_params.getReferenceVariant() ) { - reference_vid = vid; + if (exclude_var.find(vid) == exclude_var.end()) { + run_var.insert( vid ); + if ( getVariantName(vid) == run_params.getReferenceVariant() ) { + reference_vid = vid; + reference_tune_idx = 0; + } } } @@ -248,6 +460,7 @@ void Executor::setupSuite() // if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { reference_vid = *run_var.begin(); + reference_tune_idx = 0; } } else { @@ -271,9 +484,12 @@ void Executor::setupSuite() vid_it != available_var.end(); ++vid_it) { VariantID vid = *vid_it; if ( getVariantName(vid) == variant_names[it] ) { - run_var.insert(vid); - if ( getVariantName(vid) == run_params.getReferenceVariant() ) { - reference_vid = vid; + if (exclude_var.find(vid) == exclude_var.end()) { + run_var.insert(vid); + if ( getVariantName(vid) == run_params.getReferenceVariant() ) { + 
reference_vid = vid; + reference_tune_idx = 0; + } } found_it = true; } @@ -287,6 +503,7 @@ void Executor::setupSuite() // if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { reference_vid = *run_var.begin(); + reference_tune_idx = 0; } run_params.setInvalidVariantInput(invalid); @@ -300,11 +517,17 @@ void Executor::setupSuite() // A message will be emitted later so user can sort it out... // - if ( !(run_params.getInvalidKernelInput().empty()) ) { + if ( !(run_params.getInvalidNpassesCombinerOptInput().empty()) ) { run_params.setInputState(RunParams::BadInput); - } else if ( !(run_params.getInvalidFeatureInput().empty()) ) { + } else if ( !(run_params.getInvalidKernelInput().empty()) || + !(run_params.getInvalidExcludeKernelInput().empty()) ) { + + run_params.setInputState(RunParams::BadInput); + + } else if ( !(run_params.getInvalidFeatureInput().empty()) || + !(run_params.getInvalidExcludeFeatureInput().empty()) ) { run_params.setInputState(RunParams::BadInput); @@ -319,7 +542,8 @@ void Executor::setupSuite() } } - if ( !(run_params.getInvalidVariantInput().empty()) ) { + if ( !(run_params.getInvalidVariantInput().empty()) || + !(run_params.getInvalidExcludeVariantInput().empty()) ) { run_params.setInputState(RunParams::BadInput); @@ -330,6 +554,35 @@ void Executor::setupSuite() variant_ids.push_back( *vid ); } + // + // Make a single ordering of tuning names for each variant across kernels. 
+ // + for (VariantID vid : variant_ids) { + std::unordered_map tuning_names_order_map; + for (const KernelBase* kernel : kernels) { + for (std::string const& tuning_name : + kernel->getVariantTuningNames(vid)) { + if (tuning_names_order_map.find(tuning_name) == + tuning_names_order_map.end()) { + tuning_names_order_map.emplace( + tuning_name, tuning_names_order_map.size()); + } + } + } + tuning_names[vid].resize(tuning_names_order_map.size()); + for (auto const& tuning_name_idx_pair : tuning_names_order_map) { + tuning_names[vid][tuning_name_idx_pair.second] = tuning_name_idx_pair.first; + } + // reorder to put "default" first + auto default_order_iter = tuning_names_order_map.find(KernelBase::getDefaultTuningName()); + if (default_order_iter != tuning_names_order_map.end()) { + size_t default_idx = default_order_iter->second; + std::string default_name = std::move(tuning_names[vid][default_idx]); + tuning_names[vid].erase(tuning_names[vid].begin()+default_idx); + tuning_names[vid].emplace(tuning_names[vid].begin(), std::move(default_name)); + } + } + // // If we've gotten to this point, we have good input to run. 
// @@ -403,10 +656,12 @@ void Executor::reportRunSummary(ostream& str) const str << "\nThe following kernels and variants (when available for a kernel) will be run:" << endl; - str << "\nVariants" + str << "\nVariants and Tunings" << "\n--------\n"; for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - str << getVariantName(variant_ids[iv]) << endl; + for (std::string const& tuning_name : tuning_names[variant_ids[iv]]) { + str << getVariantName(variant_ids[iv]) << "-" << tuning_name<< endl; + } } str << endl; @@ -422,6 +677,15 @@ void Executor::reportRunSummary(ostream& str) const void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const { + if ( to_file ) { +#ifdef RAJA_PERFSUITE_ENABLE_MPI + int num_ranks; + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + str << "Kernels run on " << num_ranks << " MPI ranks" << endl; +#else + str << "Kernels run without MPI" << endl; +#endif + } // // Set up column headers and column widths for kernel summary output. @@ -469,7 +733,7 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const dash_width += itsrep_width + static_cast(sepchr.size()); string kernsrep_head("Kernels/rep"); - Index_type kernsrep_width = + Index_type kernsrep_width = max( static_cast(kernsrep_head.size()), static_cast(4) ); dash_width += kernsrep_width + static_cast(sepchr.size()); @@ -486,13 +750,14 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const static_cast(frsize) ) + 3; dash_width += flopsrep_width + static_cast(sepchr.size()); - str <getItsPerRep() << sepchr <getKernelsPerRep() << sepchr <getBytesPerRep() - << sepchr <getFLOPsPerRep() + << sepchr <getFLOPsPerRep() << endl; } @@ -525,71 +790,79 @@ void Executor::runSuite() return; } - cout << "\n\nRun warmup kernels...\n"; + getCout() << "\n\nRun warmup kernels...\n"; vector warmup_kernels; - warmup_kernels.push_back(new basic::DAXPY(run_params)); - warmup_kernels.push_back(new basic::REDUCE3_INT(run_params)); - warmup_kernels.push_back(new 
algorithm::SORT(run_params)); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { KernelBase* warmup_kernel = warmup_kernels[ik]; - cout << "Kernel : " << warmup_kernel->getName() << endl; - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - if ( run_params.showProgress() ) { - if ( warmup_kernel->hasVariantDefined(vid) ) { - cout << " Running "; - } else { - cout << " No "; - } - cout << getVariantName(vid) << " variant" << endl; - } - if ( warmup_kernel->hasVariantDefined(vid) ) { - warmup_kernel->execute(vid); - } - } - delete warmup_kernels[ik]; + runKernel(warmup_kernel, true); + delete warmup_kernel; + warmup_kernels[ik] = nullptr; } - cout << "\n\nRunning specified kernels and variants...\n"; + getCout() << "\n\nRunning specified kernels and variants...\n"; const int npasses = run_params.getNumPasses(); for (int ip = 0; ip < npasses; ++ip) { if ( run_params.showProgress() ) { - std::cout << "\nPass through suite # " << ip << "\n"; + getCout() << "\nPass through suite # " << ip << "\n"; } for (size_t ik = 0; ik < kernels.size(); ++ik) { KernelBase* kernel = kernels[ik]; - if ( run_params.showProgress() ) { - std::cout << "\nRun kernel -- " << kernel->getName() << "\n"; - } - - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - KernelBase* kern = kernels[ik]; - if ( run_params.showProgress() ) { - if ( kern->hasVariantDefined(vid) ) { - cout << " Running "; - } else { - cout << " No "; - } - cout << getVariantName(vid) << " variant" << endl; - } - if ( kern->hasVariantDefined(vid) ) { - kernels[ik]->execute(vid); - } - } // loop over variants - + runKernel(kernel, false); } // loop over kernels } // loop over passes through suite } +template < typename Kernel > +KernelBase* 
Executor::makeKernel() +{ + Kernel* kernel = new Kernel(run_params); + return kernel; +} + +void Executor::runKernel(KernelBase* kernel, bool print_kernel_name) +{ + if ( run_params.showProgress() || print_kernel_name) { + getCout() << endl << "Run kernel -- " << kernel->getName() << endl; + } + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + + if ( run_params.showProgress() ) { + if ( kernel->hasVariantDefined(vid) ) { + getCout() << " Running "; + } else { + getCout() << " No "; + } + getCout() << getVariantName(vid) << " variant" << endl; + } + + for (size_t tune_idx = 0; tune_idx < kernel->getNumVariantTunings(vid); ++tune_idx) { + + if ( run_params.showProgress() ) { + getCout() << " Running " + << kernel->getVariantTuningName(vid, tune_idx) << " tuning"; + } + kernel->execute(vid, tune_idx); + if ( run_params.showProgress() ) { + getCout() << " -- " << kernel->getLastTime() << " sec." << endl; + } + } + } // loop over variants +} + void Executor::outputRunData() { RunParams::InputOpt in_state = run_params.getInputState(); @@ -598,7 +871,7 @@ void Executor::outputRunData() return; } - cout << "\n\nGenerate run report files...\n"; + getCout() << "\n\nGenerate run report files...\n"; // // Generate output file prefix (including directory path). 
@@ -610,41 +883,57 @@ void Executor::outputRunData() } out_fprefix = "./" + run_params.getOutputFilePrefix(); - string filename = out_fprefix + "-timing.csv"; - writeCSVReport(filename, CSVRepMode::Timing, 6 /* prec */); + unique_ptr<ostream> file; - if ( haveReferenceVariant() ) { - filename = out_fprefix + "-speedup.csv"; - writeCSVReport(filename, CSVRepMode::Speedup, 3 /* prec */); - } - filename = out_fprefix + "-checksum.txt"; - writeChecksumReport(filename); + for (RunParams::CombinerOpt combiner : run_params.getNpassesCombinerOpts()) { + file = openOutputFile(out_fprefix + "-timing-" + RunParams::CombinerOptToStr(combiner) + ".csv"); + writeCSVReport(*file, CSVRepMode::Timing, combiner, 6 /* prec */); - filename = out_fprefix + "-fom.csv"; - writeFOMReport(filename); + if ( haveReferenceVariant() ) { + file = openOutputFile(out_fprefix + "-speedup-" + RunParams::CombinerOptToStr(combiner) + ".csv"); + writeCSVReport(*file, CSVRepMode::Speedup, combiner, 3 /* prec */); + } + } + + file = openOutputFile(out_fprefix + "-checksum.txt"); + writeChecksumReport(*file); - filename = out_fprefix + "-kernels.csv"; - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; + { + vector<FOMGroup> fom_groups; + getFOMGroups(fom_groups); + if (!fom_groups.empty() ) { + file = openOutputFile(out_fprefix + "-fom.csv"); + writeFOMReport(*file, fom_groups); + } } - if ( file ) { + file = openOutputFile(out_fprefix + "-kernels.csv"); + if ( *file ) { bool to_file = true; - writeKernelInfoSummary(file, to_file); + writeKernelInfoSummary(*file, to_file); } } - -void Executor::writeCSVReport(const string& filename, CSVRepMode mode, - size_t prec) +unique_ptr<ostream> Executor::openOutputFile(const string& filename) const { - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; + int rank = 0; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + 
MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#endif + if (rank == 0) { + unique_ptr<ostream> file(new ofstream(filename.c_str(), ios::out | ios::trunc)); + if ( !*file ) { + getCout() << " ERROR: Can't open output file " << filename << endl; + } + return file; } + return unique_ptr<ostream>(makeNullStream()); +} +void Executor::writeCSVReport(ostream& file, CSVRepMode mode, + RunParams::CombinerOpt combiner, size_t prec) +{ if ( file ) { // @@ -659,32 +948,51 @@ void Executor::writeCSVReport(const string& filename, CSVRepMode mode, } kercol_width++; - vector<size_t> varcol_width(variant_ids.size()); + vector<vector<size_t>> vartuncol_width(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - varcol_width[iv] = max(prec+2, getVariantName(variant_ids[iv]).size()); + size_t var_width = max(prec+2, getVariantName(variant_ids[iv]).size()); + for (std::string const& tuning_name : tuning_names[variant_ids[iv]]) { + vartuncol_width[iv].emplace_back(max(var_width, tuning_name.size())); + } } // // Print title line. // - file << getReportTitle(mode); + file << getReportTitle(mode, combiner); // // Wrtie CSV file contents for report. // for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - file << sepchr; + for (size_t it = 0; it < tuning_names[variant_ids[iv]].size(); ++it) { + file << sepchr; + } } file << endl; // - // Print column title line. + // Print column variant name line. 
+ // + file <getName(); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; - file << sepchr <hasVariantDefined(reference_vid) || - !kern->hasVariantDefined(vid)) ) { - file << "Not run"; - } else if ( (mode == CSVRepMode::Timing) && - !kern->hasVariantDefined(vid) ) { - file << "Not run"; - } else { - file << setprecision(prec) << std::fixed - << getReportDataEntry(mode, kern, vid); + for (size_t it = 0; it < tuning_names[variant_ids[iv]].size(); ++it) { + std::string const& tuning_name = tuning_names[variant_ids[iv]][it]; + file << sepchr <hasVariantTuningDefined(reference_vid, reference_tune_idx) || + !kern->hasVariantTuningDefined(vid, tuning_name)) ) { + file << "Not run"; + } else if ( (mode == CSVRepMode::Timing) && + !kern->hasVariantTuningDefined(vid, tuning_name) ) { + file << "Not run"; + } else { + file << setprecision(prec) << std::fixed + << getReportDataEntry(mode, combiner, kern, vid, + kern->getVariantTuningIndex(vid, tuning_name)); + } } } file << endl; @@ -718,19 +1030,8 @@ void Executor::writeCSVReport(const string& filename, CSVRepMode mode, } -void Executor::writeFOMReport(const string& filename) +void Executor::writeFOMReport(ostream& file, vector& fom_groups) { - vector fom_groups; - getFOMGroups(fom_groups); - if (fom_groups.empty() ) { - return; - } - - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; - } - if ( file ) { // @@ -748,40 +1049,64 @@ void Executor::writeFOMReport(const string& filename) size_t fom_col_width = prec+14; - size_t ncols = 0; + std::vector fom_group_ncols(fom_groups.size(), 0); for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; - ncols += group.variants.size(); // num variants to compare - // to each PM baseline + + for (size_t gv = 0; gv < group.variants.size(); ++gv) { + VariantID vid = group.variants[gv]; + const string& variant_name = 
getVariantName(vid); + // num variants and tuning + // Includes the PM baseline and the variants and tunings to compared to it + fom_group_ncols[ifg] += tuning_names[vid].size(); + for (const string& tuning_name : tuning_names[vid]) { + fom_col_width = max(fom_col_width, variant_name.size()+1+tuning_name.size()); + } + } } - vector col_exec_count(ncols, 0); - vector col_min(ncols, numeric_limits::max()); - vector col_max(ncols, -numeric_limits::max()); - vector col_avg(ncols, 0.0); - vector col_stddev(ncols, 0.0); - vector< vector > pct_diff(kernels.size()); + vector< vector > col_exec_count(fom_groups.size()); + vector< vector > col_min(fom_groups.size()); + vector< vector > col_max(fom_groups.size()); + vector< vector > col_avg(fom_groups.size()); + vector< vector > col_stddev(fom_groups.size()); + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + col_exec_count[ifg].resize(fom_group_ncols[ifg], 0); + col_min[ifg].resize(fom_group_ncols[ifg], numeric_limits::max()); + col_max[ifg].resize(fom_group_ncols[ifg], -numeric_limits::max()); + col_avg[ifg].resize(fom_group_ncols[ifg], 0.0); + col_stddev[ifg].resize(fom_group_ncols[ifg], 0.0); + } + vector< vector< vector > > pct_diff(kernels.size()); for (size_t ik = 0; ik < kernels.size(); ++ik) { - pct_diff[ik] = vector(ncols, 0.0); + pct_diff[ik].resize(fom_groups.size()); + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + pct_diff[ik][ifg].resize(fom_group_ncols[ifg], 0.0); + } } // // Print title line. // file << "FOM Report : signed speedup(-)/slowdown(+) for each PM (base vs. 
RAJA) -> (T_RAJA - T_base) / T_base )"; - for (size_t iv = 0; iv < ncols*2; ++iv) { - file << sepchr; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + for (size_t iv = 0; iv < fom_group_ncols[ifg]*2; ++iv) { + file << sepchr; + } } file << endl; file << "'OVER_TOL' in column to right if RAJA speedup is over tolerance"; - for (size_t iv = 0; iv < ncols*2; ++iv) { - file << sepchr; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + for (size_t iv = 0; iv < fom_group_ncols[ifg]*2; ++iv) { + file << sepchr; + } } file << endl; string pass(", "); string fail(",OVER_TOL"); + string base(",base_ref"); // // Print column title line. @@ -790,8 +1115,12 @@ void Executor::writeFOMReport(const string& filename) for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; for (size_t gv = 0; gv < group.variants.size(); ++gv) { - string name = getVariantName(group.variants[gv]); - file << sepchr <getName(); - int col = 0; for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; - VariantID base_vid = group.base; + constexpr double unknown_totTime = -1.0; + double base_totTime = unknown_totTime; + size_t col = 0; for (size_t gv = 0; gv < group.variants.size(); ++gv) { - VariantID comp_vid = group.variants[gv]; - - // - // If kernel variant was run, generate data for it and - // print (signed) percentage difference from baseline. - // - if ( kern->wasVariantRun(comp_vid) ) { - col_exec_count[col]++; - - pct_diff[ik][col] = - (kern->getTotTime(comp_vid) - kern->getTotTime(base_vid)) / - kern->getTotTime(base_vid); - - string pfstring(pass); - if (pct_diff[ik][col] > run_params.getPFTolerance()) { - pfstring = fail; - } + VariantID vid = group.variants[gv]; - file << sepchr << setw(fom_col_width) << setprecision(prec) - <getVariantTuningIndex(vid, tuning_name); // - // Gather data for column summaries (unsigned). 
+ // If kernel variant was run, generate data for it and + // print (signed) percentage difference from baseline. // - col_min[col] = min( col_min[col], pct_diff[ik][col] ); - col_max[col] = max( col_max[col], pct_diff[ik][col] ); - col_avg[col] += pct_diff[ik][col]; + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { + col_exec_count[ifg][col]++; - } else { // variant was not run, print a big fat goose egg... + bool is_base = (base_totTime == unknown_totTime); + if (is_base) { + base_totTime = kern->getTotTime(vid, tune_idx); + } - file << sepchr <getTotTime(vid, tune_idx) - base_totTime) / base_totTime; - } + string pfstring(pass); + if (pct_diff[ik][ifg][col] > run_params.getPFTolerance()) { + pfstring = fail; + } + if (is_base) { + pfstring = base; + } + + file << sepchr << setw(fom_col_width) << setprecision(prec) + < 0 ) { - col_avg[col] /= col_exec_count[col]; - } else { - col_avg[col] = 0.0; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + for (size_t col = 0; col < fom_group_ncols[ifg]; ++col) { + if ( col_exec_count[ifg][col] > 0 ) { + col_avg[ifg][col] /= col_exec_count[ifg][col]; + } else { + col_avg[ifg][col] = 0.0; + } } } - // Column standard deviaation... + // Column standard deviation... 
for (size_t ik = 0; ik < kernels.size(); ++ik) { KernelBase* kern = kernels[ik]; - int col = 0; for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; + int col = 0; for (size_t gv = 0; gv < group.variants.size(); ++gv) { - VariantID comp_vid = group.variants[gv]; + VariantID vid = group.variants[gv]; - if ( kern->wasVariantRun(comp_vid) ) { - col_stddev[col] += ( pct_diff[ik][col] - col_avg[col] ) * - ( pct_diff[ik][col] - col_avg[col] ); - } + for (const string& tuning_name : tuning_names[vid]) { + + size_t tune_idx = kern->getVariantTuningIndex(vid, tuning_name); + + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { + col_stddev[ifg][col] += ( pct_diff[ik][ifg][col] - col_avg[ifg][col] ) * + ( pct_diff[ik][ifg][col] - col_avg[ifg][col] ); + } - col++; + col++; + } } // loop over group variants @@ -899,11 +1248,13 @@ void Executor::writeFOMReport(const string& filename) } // loop over kernels - for (size_t col = 0; col < ncols; ++col) { - if ( col_exec_count[col] > 0 ) { - col_stddev[col] /= col_exec_count[col]; - } else { - col_stddev[col] = 0.0; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + for (size_t col = 0; col < fom_group_ncols[ifg]; ++col) { + if ( col_exec_count[ifg][col] > 0 ) { + col_stddev[ifg][col] /= col_exec_count[ifg][col]; + } else { + col_stddev[ifg][col] = 0.0; + } } } @@ -911,36 +1262,46 @@ void Executor::writeFOMReport(const string& filename) // Print column summaries. 
// file <getName().size()); - } - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - namecol_width = max(namecol_width, - getVariantName(variant_ids[iv]).size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t var_width = getVariantName(variant_ids[iv]).size(); + for (std::string const& tuning_name : + kernels[ik]->getVariantTuningNames(variant_ids[iv])) { + namecol_width = max(namecol_width, var_width+1+tuning_name.size()); + } + } } namecol_width++; @@ -985,7 +1349,11 @@ void Executor::writeChecksumReport(const string& filename) // Print title. // file << equal_line << endl; - file << "Checksum Report " << endl; + file << "Checksum Report "; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + file << "for " << num_ranks << " MPI ranks "; +#endif + file << endl; file << equal_line << endl; // @@ -994,10 +1362,22 @@ void Executor::writeChecksumReport(const string& filename) file <wasVariantRun(vid) ) { - cksum_ref = kern->getChecksum(vid); - found_ref = true; + size_t num_tunings = kern->getNumVariantTunings(vid); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { + cksum_ref = kern->getChecksum(vid, tune_idx); + found_ref = true; + break; + } } ++ivck; } + // get vector of checksums and diffs + std::vector> checksums(variant_ids.size()); + std::vector> checksums_diff(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + + checksums[iv].resize(num_tunings, 0.0); + checksums_diff[iv].resize(num_tunings, 0.0); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { + checksums[iv][tune_idx] = kern->getChecksum(vid, tune_idx); + checksums_diff[iv][tune_idx] = cksum_ref - kern->getChecksum(vid, tune_idx); + } + } + } - if ( kern->wasVariantRun(vid) ) { - Checksum_type vcheck_sum = kern->getChecksum(vid); - 
Checksum_type diff = cksum_ref - kern->getChecksum(vid); +#ifdef RAJA_PERFSUITE_ENABLE_MPI + if (Checksum_MPI_type == MPI_DATATYPE_NULL) { + getCout() << "Checksum_MPI_type is invalid" << endl; + } - file <> checksums_sum(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_sum[iv].resize(num_tunings, 0.0); + MPI_Allreduce(checksums[iv].data(), checksums_sum[iv].data(), num_tunings, + Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + } + + std::vector> checksums_avg(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_avg[iv].resize(num_tunings, 0.0); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_avg[iv][tune_idx] = checksums_sum[iv][tune_idx] / num_ranks; + } + } + + // get stats for checksums_abs_diff + std::vector> checksums_abs_diff(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff[iv].resize(num_tunings, 0.0); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_abs_diff[iv][tune_idx] = std::abs(checksums_diff[iv][tune_idx]); + } + } + + std::vector> checksums_abs_diff_min(variant_ids.size()); + std::vector> checksums_abs_diff_max(variant_ids.size()); + std::vector> checksums_abs_diff_sum(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff_min[iv].resize(num_tunings, 0.0); + checksums_abs_diff_max[iv].resize(num_tunings, 0.0); + checksums_abs_diff_sum[iv].resize(num_tunings, 0.0); + + MPI_Allreduce(checksums_abs_diff[iv].data(), checksums_abs_diff_min[iv].data(), num_tunings, + Checksum_MPI_type, MPI_MIN, MPI_COMM_WORLD); + 
MPI_Allreduce(checksums_abs_diff[iv].data(), checksums_abs_diff_max[iv].data(), num_tunings, + Checksum_MPI_type, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(checksums_abs_diff[iv].data(), checksums_abs_diff_sum[iv].data(), num_tunings, + Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + } + + std::vector> checksums_abs_diff_avg(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff_avg[iv].resize(num_tunings, 0.0); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_abs_diff_avg[iv][tune_idx] = checksums_abs_diff_sum[iv][tune_idx] / num_ranks; } + } + + std::vector> checksums_abs_diff_diff2avg2(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff_diff2avg2[iv].resize(num_tunings, 0.0); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_abs_diff_diff2avg2[iv][tune_idx] = (checksums_abs_diff[iv][tune_idx] - checksums_abs_diff_avg[iv][tune_idx]) * + (checksums_abs_diff[iv][tune_idx] - checksums_abs_diff_avg[iv][tune_idx]) ; + } + } + + std::vector> checksums_abs_diff_stddev(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff_stddev[iv].resize(num_tunings, 0.0); + MPI_Allreduce(checksums_abs_diff_diff2avg2[iv].data(), checksums_abs_diff_stddev[iv].data(), num_tunings, + Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_abs_diff_stddev[iv][tune_idx] = std::sqrt(checksums_abs_diff_stddev[iv][tune_idx] / num_ranks); + } + } + +#endif + + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + const string& variant_name = getVariantName(vid); + + size_t num_tunings = 
kernels[ik]->getNumVariantTunings(variant_ids[iv]); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + const string& tuning_name = kern->getVariantTuningName(vid, tune_idx); + + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { + file <getName() << "(" << vid << ")" << endl; - cout << "\tref_time, tot_time, retval = " - << kern->getTotTime(reference_vid) << " , " - << kern->getTotTime(vid) << " , " + getCout() << "Kernel(iv): " << kern->getName() << "(" << vid << ")" + << "(" << tune_idx << ")" << endl; + getCout() << "\tref_time, tot_time, retval = " + << kern->getTotTime(reference_vid, reference_tune_idx) << " , " + << kern->getTotTime(vid, tune_idx) << " , " << retval << endl; #endif } break; } - default : { cout << "\n Unknown CSV report mode = " << mode << endl; } + default : { getCout() << "\n Unknown CSV report mode = " << mode << endl; } }; return retval; } @@ -1115,7 +1662,7 @@ void Executor::getFOMGroups(vector& fom_groups) if ( vname.find("Base") != string::npos ) { FOMGroup group; - group.base = vid; + group.variants.push_back(vid); string::size_type pos = vname.find("_"); string pm(vname.substr(pos+1, string::npos)); @@ -1136,12 +1683,12 @@ void Executor::getFOMGroups(vector& fom_groups) } // iterate over variant ids to run #if 0 // RDH DEBUG (leave this here, it's useful for debugging!) - cout << "\nFOMGroups..." << endl; + getCout() << "\nFOMGroups..." 
<< endl; for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; - cout << "\tBase : " << getVariantName(group.base) << endl; + getCout() << "\tBase : " << getVariantName(group.base) << endl; for (size_t iv = 0; iv < group.variants.size(); ++iv) { - cout << "\t\t " << getVariantName(group.variants[iv]) << endl; + getCout() << "\t\t " << getVariantName(group.variants[iv]) << endl; } } #endif diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 32e978f9a..a4403f1eb 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -13,6 +13,8 @@ #include "common/RunParams.hpp" #include +#include +#include #include #include @@ -24,7 +26,8 @@ class WarmupKernel; /*! ******************************************************************************* * - * \brief Class that assembles kernels and variants to run and executes them. + * \brief Class that assembles kernels, variants, and tunings to run and + * executes them. 
* ******************************************************************************* */ @@ -54,30 +57,44 @@ class Executor }; struct FOMGroup { - VariantID base; std::vector variants; }; + template < typename Kernel > + KernelBase* makeKernel(); + + void runKernel(KernelBase* kern, bool print_kernel_name); + + std::unique_ptr openOutputFile(const std::string& filename) const; + bool haveReferenceVariant() { return reference_vid < NumVariants; } void writeKernelInfoSummary(std::ostream& str, bool to_file) const; - void writeCSVReport(const std::string& filename, CSVRepMode mode, - size_t prec); - std::string getReportTitle(CSVRepMode mode); - long double getReportDataEntry(CSVRepMode mode, - KernelBase* kern, VariantID vid); + void writeCSVReport(std::ostream& file, CSVRepMode mode, + RunParams::CombinerOpt combiner, size_t prec); + std::string getReportTitle(CSVRepMode mode, RunParams::CombinerOpt combiner); + long double getReportDataEntry(CSVRepMode mode, RunParams::CombinerOpt combiner, + KernelBase* kern, VariantID vid, size_t tune_idx); - void writeChecksumReport(const std::string& filename); + void writeChecksumReport(std::ostream& file); - void writeFOMReport(const std::string& filename); + void writeFOMReport(std::ostream& file, std::vector& fom_groups); void getFOMGroups(std::vector& fom_groups); RunParams run_params; std::vector kernels; std::vector variant_ids; + std::vector tuning_names[NumVariants]; VariantID reference_vid; + size_t reference_tune_idx; + +public: + // Methods for verification testing in CI. 
+ std::vector getKernels() const { return kernels; } + std::vector getVariantIDs() const { return variant_ids; } + }; } // closing brace for rajaperf namespace diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp new file mode 100644 index 000000000..76362ee1c --- /dev/null +++ b/src/common/GPUUtils.hpp @@ -0,0 +1,197 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Methods and classes for GPU kernel templates. +/// + + +#ifndef RAJAPerf_GPUUtils_HPP +#define RAJAPerf_GPUUtils_HPP + +#include "rajaperf_config.hpp" + +namespace rajaperf +{ + +namespace gpu_block_size +{ + +namespace detail +{ + +// implementation of sqrt via binary search +// copied from https://stackoverflow.com/questions/8622256/in-c11-is-sqrt-defined-as-constexpr +constexpr size_t sqrt_helper(size_t n, size_t lo, size_t hi) +{ + return (lo == hi) + ? lo // search complete + : ((n / ((lo + hi + 1) / 2) < ((lo + hi + 1) / 2)) + ? sqrt_helper(n, lo, ((lo + hi + 1) / 2)-1) // search lower half + : sqrt_helper(n, ((lo + hi + 1) / 2), hi)); // search upper half +} + +// implementation of lesser_of_squarest_factor_pair via linear search +constexpr size_t lesser_of_squarest_factor_pair_helper(size_t n, size_t guess) +{ + return ((n / guess) * guess == n) + ? guess // search complete, guess is a factor + : lesser_of_squarest_factor_pair_helper(n, guess - 1); // continue searching +} + +// class to get the size of a camp::int_seq +template < typename IntSeq > +struct SizeOfIntSeq; +/// +template < size_t... 
Is > +struct SizeOfIntSeq> +{ + static const size_t size = sizeof...(Is); +}; + +// class to help prepend integers to a list +// this is used for the false case where I is not prepended to IntSeq +template < bool B, size_t I, typename IntSeq > +struct conditional_prepend +{ + using type = IntSeq; +}; +/// this is used for the true case where I is prepended to IntSeq +template < size_t I, size_t... Is > +struct conditional_prepend> +{ + using type = camp::int_seq; +}; + +// class to help create a sequence that is only the valid values in IntSeq +template < typename validity_checker, typename IntSeq > +struct remove_invalid; + +// base case where the list is empty, use the empty list +template < typename validity_checker > +struct remove_invalid> +{ + using type = camp::int_seq; +}; + +// check validity of I and conditionally prepend I to a recursively generated +// list of valid values +template < typename validity_checker, size_t I, size_t... Is > +struct remove_invalid> +{ + using type = typename conditional_prepend< + validity_checker::template valid(), + I, + typename remove_invalid>::type + >::type; +}; + +} // namespace detail + +// constexpr integer sqrt +constexpr size_t sqrt(size_t n) +{ + return detail::sqrt_helper(n, 0, n/2 + 1); +} + +// constexpr return the lesser of the most square pair of factors of n +// ex. 12 has pairs of factors (1, 12) (2, 6) *(3, 4)* and returns 3 +constexpr size_t lesser_of_squarest_factor_pair(size_t n) +{ + return (n == 0) + ? 0 // return 0 in the 0 case + : detail::lesser_of_squarest_factor_pair_helper(n, sqrt(n)); +} +// constexpr return the greater of the most square pair of factors of n +// ex. 12 has pairs of factors (1, 12) (2, 6) *(3, 4)* and returns 4 +constexpr size_t greater_of_squarest_factor_pair(size_t n) +{ + return (n == 0) + ? 
0 // return 0 in the 0 case + : n / detail::lesser_of_squarest_factor_pair_helper(n, sqrt(n)); +} + +// always true +struct AllowAny +{ + template < size_t I > + static constexpr bool valid() { return true; } +}; + +// true if of I is a multiple of N, false otherwise +template < size_t N > +struct MultipleOf +{ + template < size_t I > + static constexpr bool valid() { return (I/N)*N == I; } +}; + +// true if the sqrt of I is representable as a size_t, false otherwise +struct ExactSqrt +{ + template < size_t I > + static constexpr bool valid() { return sqrt(I)*sqrt(I) == I; } +}; + +template < size_t... block_sizes > +using list_type = camp::int_seq; + +// A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes +// if rajaperf::configuration::gpu_block_sizes is not empty +// and a camp::int_seq of default_block_size otherwise +// with invalid entries removed according to validity_checker +template < size_t default_block_size, typename validity_checker = AllowAny > +using make_list_type = + typename detail::remove_invalid::size > 0), + rajaperf::configuration::gpu_block_sizes, + list_type + >::type + >::type; + +} // closing brace for gpu_block_size namespace + +//compile time loop over an integer sequence +//this allows for creating a loop over a compile time constant variable +template +inline void seq_for(camp::int_seq const&, Func&& func) +{ + // braced init lists are evaluated in order + int seq_unused_array[] = {(func(camp::integral_constant{}), 0)...}; + RAJAPERF_UNUSED_VAR(seq_unused_array); +} + +} // closing brace for rajaperf namespace + +// +#define RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(kernel, variant) \ + void kernel::run##variant##Variant(VariantID vid, size_t tune_idx) \ + { \ + size_t t = 0; \ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { \ + if (run_params.numValidGPUBlockSize() == 0u || \ + run_params.validGPUBlockSize(block_size)) { \ + if (tune_idx == t) { \ + run##variant##VariantImpl(vid); \ + } \ + t 
+= 1; \ + } \ + }); \ + } \ + \ + void kernel::set##variant##TuningDefinitions(VariantID vid) \ + { \ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { \ + if (run_params.numValidGPUBlockSize() == 0u || \ + run_params.validGPUBlockSize(block_size)) { \ + addVariantTuningName(vid, "block_"+std::to_string(block_size)); \ + } \ + }); \ + } + +#endif // closing endif for header file include guard diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 550563d2f..a3871d31e 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -18,6 +18,7 @@ #if defined(RAJA_ENABLE_HIP) +#include "common/GPUUtils.hpp" #include "RAJA/policy/hip/raja_hiperrchk.hpp" @@ -31,16 +32,37 @@ namespace rajaperf template < typename Lambda > __global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) { - Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - body(i); - } + Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + body(i); + } +} +/// +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) +{ + Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + body(i); + } } /*! -* \brief Simple hip kernel that runs a lambda. -*/ -template __global__ void lambda_hip(Lambda body) { body(); } + * \brief Simple hip kernel that runs a lambda. 
+ */ +template < typename Lambda > +__global__ void lambda_hip(Lambda body) +{ + body(); +} +/// +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void lambda_hip(Lambda body) +{ + body(); +} /*! * \brief Getters for hip kernel indices. diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 69d195700..a07a6bbbb 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -11,11 +11,12 @@ #include "RunParams.hpp" #include +#include namespace rajaperf { KernelBase::KernelBase(KernelID kid, const RunParams& params) : - run_params(params) + run_params(params) { kernel_id = kid; name = getFullKernelName(kernel_id); @@ -24,44 +25,33 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) : default_reps = -1; actual_prob_size = -1; - + for (size_t fid = 0; fid < NumFeatures; ++fid) { uses_feature[fid] = false; } - for (size_t vid = 0; vid < NumVariants; ++vid) { - has_variant_defined[vid] = false; - } - its_per_rep = -1; kernels_per_rep = -1; bytes_per_rep = -1; FLOPs_per_rep = -1; running_variant = NumVariants; + running_tuning = getUnknownTuningIdx(); checksum_scale_factor = 1.0; - - for (size_t vid = 0; vid < NumVariants; ++vid) { - checksum[vid] = 0.0; - num_exec[vid] = 0; - min_time[vid] = std::numeric_limits::max(); - max_time[vid] = -std::numeric_limits::max(); - tot_time[vid] = 0.0; - } } - + KernelBase::~KernelBase() { } Index_type KernelBase::getTargetProblemSize() const -{ +{ Index_type target_size = static_cast(0); if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Factor) { - target_size = + target_size = 
static_cast(default_prob_size*run_params.getSizeFactor()); } else if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Direct) { target_size = static_cast(run_params.getSize()); @@ -70,53 +60,126 @@ Index_type KernelBase::getTargetProblemSize() const } Index_type KernelBase::getRunReps() const -{ +{ Index_type run_reps = static_cast(0); if (run_params.getInputState() == RunParams::CheckRun) { run_reps = static_cast(run_params.getCheckRunReps()); } else { - run_reps = static_cast(default_reps*run_params.getRepFactor()); + run_reps = static_cast(default_reps*run_params.getRepFactor()); } return run_reps; } -void KernelBase::setVariantDefined(VariantID vid) +void KernelBase::setVariantDefined(VariantID vid) { - has_variant_defined[vid] = isVariantAvailable(vid); -} + if (!isVariantAvailable(vid)) return; + + switch ( vid ) { + + case Base_Seq : + { + setSeqTuningDefinitions(vid); + break; + } + + case Lambda_Seq : + case RAJA_Seq : + { +#if defined(RUN_RAJA_SEQ) + setSeqTuningDefinitions(vid); +#endif + break; + } + case Base_OpenMP : + case Lambda_OpenMP : + case RAJA_OpenMP : + { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + setOpenMPTuningDefinitions(vid); +#endif + break; + } -void KernelBase::execute(VariantID vid) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { +#if defined(RAJA_ENABLE_TARGET_OPENMP) + setOpenMPTargetTuningDefinitions(vid); +#endif + break; + } + + case Base_CUDA : + case Lambda_CUDA : + case RAJA_CUDA : + { +#if defined(RAJA_ENABLE_CUDA) + setCudaTuningDefinitions(vid); +#endif + break; + } + + case Base_HIP : + case Lambda_HIP : + case RAJA_HIP : + { +#if defined(RAJA_ENABLE_HIP) + setHipTuningDefinitions(vid); +#endif + break; + } + + default : { +#if 0 + getCout() << "\n " << getName() + << " : Unknown variant id = " << vid << std::endl; +#endif + } + } + + checksum[vid].resize(variant_tuning_names[vid].size(), 0.0); + num_exec[vid].resize(variant_tuning_names[vid].size(), 0); + 
min_time[vid].resize(variant_tuning_names[vid].size(), std::numeric_limits::max()); + max_time[vid].resize(variant_tuning_names[vid].size(), -std::numeric_limits::max()); + tot_time[vid].resize(variant_tuning_names[vid].size(), 0.0); +} + +void KernelBase::execute(VariantID vid, size_t tune_idx) { running_variant = vid; + running_tuning = tune_idx; resetTimer(); resetDataInitCount(); - this->setUp(vid); - - this->runKernel(vid); + this->setUp(vid, tune_idx); + + this->runKernel(vid, tune_idx); - this->updateChecksum(vid); + this->updateChecksum(vid, tune_idx); - this->tearDown(vid); + this->tearDown(vid, tune_idx); - running_variant = NumVariants; + running_variant = NumVariants; + running_tuning = getUnknownTuningIdx(); } void KernelBase::recordExecTime() { - num_exec[running_variant]++; + num_exec[running_variant].at(running_tuning)++; RAJA::Timer::ElapsedType exec_time = timer.elapsed(); - min_time[running_variant] = std::min(min_time[running_variant], exec_time); - max_time[running_variant] = std::max(max_time[running_variant], exec_time); - tot_time[running_variant] += exec_time; + min_time[running_variant].at(running_tuning) = + std::min(min_time[running_variant].at(running_tuning), exec_time); + max_time[running_variant].at(running_tuning) = + std::max(max_time[running_variant].at(running_tuning), exec_time); + tot_time[running_variant].at(running_tuning) += exec_time; } -void KernelBase::runKernel(VariantID vid) +void KernelBase::runKernel(VariantID vid, size_t tune_idx) { - if ( !has_variant_defined[vid] ) { + if ( !hasVariantDefined(vid) ) { return; } @@ -124,7 +187,7 @@ void KernelBase::runKernel(VariantID vid) case Base_Seq : { - runSeqVariant(vid); + runSeqVariant(vid, tune_idx); break; } @@ -132,7 +195,7 @@ void KernelBase::runKernel(VariantID vid) case RAJA_Seq : { #if defined(RUN_RAJA_SEQ) - runSeqVariant(vid); + runSeqVariant(vid, tune_idx); #endif break; } @@ -142,7 +205,7 @@ void KernelBase::runKernel(VariantID vid) case RAJA_OpenMP : { #if 
defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - runOpenMPVariant(vid); + runOpenMPVariant(vid, tune_idx); #endif break; } @@ -151,7 +214,7 @@ void KernelBase::runKernel(VariantID vid) case RAJA_OpenMPTarget : { #if defined(RAJA_ENABLE_TARGET_OPENMP) - runOpenMPTargetVariant(vid); + runOpenMPTargetVariant(vid, tune_idx); #endif break; } @@ -161,7 +224,7 @@ void KernelBase::runKernel(VariantID vid) case RAJA_CUDA : { #if defined(RAJA_ENABLE_CUDA) - runCudaVariant(vid); + runCudaVariant(vid, tune_idx); #endif break; } @@ -171,14 +234,14 @@ void KernelBase::runKernel(VariantID vid) case RAJA_HIP : { #if defined(RAJA_ENABLE_HIP) - runHipVariant(vid); + runHipVariant(vid, tune_idx); #endif break; } default : { #if 0 - std::cout << "\n " << getName() + getCout() << "\n " << getName() << " : Unknown variant id = " << vid << std::endl; #endif } @@ -195,13 +258,17 @@ void KernelBase::print(std::ostream& os) const os << "\t\t\t actual_prob_size = " << actual_prob_size << std::endl; os << "\t\t\t uses_feature: " << std::endl; for (unsigned j = 0; j < NumFeatures; ++j) { - os << "\t\t\t\t" << getFeatureName(static_cast(j)) - << " : " << uses_feature[j] << std::endl; + os << "\t\t\t\t" << getFeatureName(static_cast(j)) + << " : " << uses_feature[j] << std::endl; } - os << "\t\t\t has_variant_defined: " << std::endl; + os << "\t\t\t variant_tuning_names: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << has_variant_defined[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " :" << std::endl; + for (size_t t = 0; t < variant_tuning_names[j].size(); ++t) { + os << "\t\t\t\t\t" << getVariantTuningName(static_cast(j), t) + << std::endl; + } } os << "\t\t\t its_per_rep = " << its_per_rep << std::endl; os << "\t\t\t kernels_per_rep = " << kernels_per_rep << std::endl; @@ -209,28 +276,43 @@ void KernelBase::print(std::ostream& os) const os << "\t\t\t FLOPs_per_rep = " << 
FLOPs_per_rep << std::endl; os << "\t\t\t num_exec: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << num_exec[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " :" << std::endl; + for (size_t t = 0; t < num_exec[j].size(); ++t) { + os << "\t\t\t\t\t" << num_exec[j][t] << std::endl; + } } os << "\t\t\t min_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << min_time[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " :" << std::endl; + for (size_t t = 0; t < min_time[j].size(); ++t) { + os << "\t\t\t\t\t" << min_time[j][t] << std::endl; + } } os << "\t\t\t max_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << max_time[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " :" << std::endl; + for (size_t t = 0; t < max_time[j].size(); ++t) { + os << "\t\t\t\t\t" << max_time[j][t] << std::endl; + } } os << "\t\t\t tot_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << tot_time[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " :" << std::endl; + for (size_t t = 0; t < tot_time[j].size(); ++t) { + os << "\t\t\t\t\t" << tot_time[j][t] << std::endl; + } } os << "\t\t\t checksum: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << checksum[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " :" << std::endl; + for (size_t t = 0; t < checksum[j].size(); ++t) { + os << "\t\t\t\t\t" << checksum[j][t] << std::endl; + } } os << std::endl; } diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 8c2dfb799..8d74d6e05 100644 --- 
a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -13,8 +13,12 @@ #include "common/RPTypes.hpp" #include "common/DataUtils.hpp" #include "common/RunParams.hpp" +#include "common/GPUUtils.hpp" #include "RAJA/util/Timer.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include +#endif #if defined(RAJA_ENABLE_CUDA) #include "RAJA/policy/cuda/raja_cudaerrchk.hpp" #endif @@ -23,6 +27,7 @@ #endif #include +#include #include #include @@ -38,6 +43,10 @@ namespace rajaperf { class KernelBase { public: + static constexpr size_t getUnknownTuningIdx() + { return std::numeric_limits::max(); } + static std::string getDefaultTuningName() { return "default"; } + KernelBase(KernelID kid, const RunParams& params); virtual ~KernelBase(); @@ -60,6 +69,27 @@ class KernelBase void setUsesFeature(FeatureID fid) { uses_feature[fid] = true; } void setVariantDefined(VariantID vid); + void addVariantTuningName(VariantID vid, std::string name) + { variant_tuning_names[vid].emplace_back(std::move(name)); } + + virtual void setSeqTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, getDefaultTuningName()); } +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + virtual void setOpenMPTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, getDefaultTuningName()); } +#endif +#if defined(RAJA_ENABLE_CUDA) + virtual void setCudaTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, getDefaultTuningName()); } +#endif +#if defined(RAJA_ENABLE_HIP) + virtual void setHipTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, getDefaultTuningName()); } +#endif +#if defined(RAJA_ENABLE_TARGET_OPENMP) + virtual void 
setOpenMPTargetTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, getDefaultTuningName()); } +#endif // // Getter methods used to generate kernel execution summary @@ -80,22 +110,61 @@ class KernelBase bool usesFeature(FeatureID fid) const { return uses_feature[fid]; }; bool hasVariantDefined(VariantID vid) const - { return has_variant_defined[vid]; } - + { return !variant_tuning_names[vid].empty(); } + bool hasVariantTuningDefined(VariantID vid, size_t tune_idx) const + { + if (hasVariantDefined(vid) && tune_idx < getNumVariantTunings(vid)) { + return true; + } + return false; + } + bool hasVariantTuningDefined(VariantID vid, std::string const& tuning_name) const + { + if (hasVariantDefined(vid)) { + for (std::string const& a_tuning_name : getVariantTuningNames(vid)) { + if (tuning_name == a_tuning_name) { return true; } + } + } + return false; + } + size_t getVariantTuningIndex(VariantID vid, std::string const& tuning_name) const + { + std::vector const& tuning_names = getVariantTuningNames(vid); + for (size_t t = 0; t < tuning_names.size(); ++t) { + std::string const& a_tuning_name = tuning_names[t]; + if (tuning_name == a_tuning_name) { return t; } + } + return getUnknownTuningIdx(); + } + size_t getNumVariantTunings(VariantID vid) const + { return getVariantTuningNames(vid).size(); } + std::string const& getVariantTuningName(VariantID vid, size_t tune_idx) const + { return getVariantTuningNames(vid).at(tune_idx); } + std::vector const& getVariantTuningNames(VariantID vid) const + { return variant_tuning_names[vid]; } // // Methods to get information about kernel execution for reports // containing kernel execution information // - bool wasVariantRun(VariantID vid) const - { return num_exec[vid] > 0; } + bool wasVariantTuningRun(VariantID vid, size_t tune_idx) const + { + if (tune_idx != getUnknownTuningIdx()) { + return num_exec[vid].at(tune_idx) > 0; + } + return false; + } + + // get runtime of executed variant/tuning + double getLastTime() const 
{ return timer.elapsed(); } - double getMinTime(VariantID vid) const { return min_time[vid]; } - double getMaxTime(VariantID vid) const { return max_time[vid]; } - double getTotTime(VariantID vid) { return tot_time[vid]; } - Checksum_type getChecksum(VariantID vid) const { return checksum[vid]; } + // get timers accumulated over npasses + double getMinTime(VariantID vid, size_t tune_idx) const { return min_time[vid].at(tune_idx); } + double getMaxTime(VariantID vid, size_t tune_idx) const { return max_time[vid].at(tune_idx); } + double getTotTime(VariantID vid, size_t tune_idx) { return tot_time[vid].at(tune_idx); } + Checksum_type getChecksum(VariantID vid, size_t tune_idx) const { return checksum[vid].at(tune_idx); } - void execute(VariantID vid); + void execute(VariantID vid, size_t tune_idx); void synchronize() { @@ -118,12 +187,18 @@ class KernelBase void startTimer() { synchronize(); +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif timer.start(); } void stopTimer() { synchronize(); +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif timer.stop(); recordExecTime(); } @@ -136,30 +211,30 @@ class KernelBase virtual void print(std::ostream& os) const; - virtual void runKernel(VariantID vid); + virtual void runKernel(VariantID vid, size_t tune_idx); - virtual void setUp(VariantID vid) = 0; - virtual void updateChecksum(VariantID vid) = 0; - virtual void tearDown(VariantID vid) = 0; + virtual void setUp(VariantID vid, size_t tune_idx) = 0; + virtual void updateChecksum(VariantID vid, size_t tune_idx) = 0; + virtual void tearDown(VariantID vid, size_t tune_idx) = 0; - virtual void runSeqVariant(VariantID vid) = 0; + virtual void runSeqVariant(VariantID vid, size_t tune_idx) = 0; #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - virtual void runOpenMPVariant(VariantID vid) = 0; + virtual void runOpenMPVariant(VariantID vid, size_t tune_idx) = 0; #endif #if defined(RAJA_ENABLE_CUDA) - virtual void 
runCudaVariant(VariantID vid) = 0; + virtual void runCudaVariant(VariantID vid, size_t tune_idx) = 0; #endif #if defined(RAJA_ENABLE_HIP) - virtual void runHipVariant(VariantID vid) = 0; + virtual void runHipVariant(VariantID vid, size_t tune_idx) = 0; #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - virtual void runOpenMPTargetVariant(VariantID vid) = 0; + virtual void runOpenMPTargetVariant(VariantID vid, size_t tune_idx) = 0; #endif protected: const RunParams& run_params; - Checksum_type checksum[NumVariants]; + std::vector checksum[NumVariants]; Checksum_type checksum_scale_factor; private: @@ -180,7 +255,7 @@ class KernelBase bool uses_feature[NumFeatures]; - bool has_variant_defined[NumVariants]; + std::vector variant_tuning_names[NumVariants]; // // Properties of kernel dependent on how kernel is run @@ -191,14 +266,15 @@ class KernelBase Index_type FLOPs_per_rep; VariantID running_variant; + size_t running_tuning; - int num_exec[NumVariants]; + std::vector num_exec[NumVariants]; RAJA::Timer timer; - RAJA::Timer::ElapsedType min_time[NumVariants]; - RAJA::Timer::ElapsedType max_time[NumVariants]; - RAJA::Timer::ElapsedType tot_time[NumVariants]; + std::vector min_time[NumVariants]; + std::vector max_time[NumVariants]; + std::vector tot_time[NumVariants]; }; } // closing brace for rajaperf namespace diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index 88fa6d759..fc36cef1f 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/common/OutputUtils.cpp b/src/common/OutputUtils.cpp index 0696b078f..96b09c542 100644 --- a/src/common/OutputUtils.cpp +++ b/src/common/OutputUtils.cpp @@ -1,13 +1,18 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#include "RAJAPerfSuite.hpp" #include "OutputUtils.hpp" +#ifdef RAJA_PERFSUITE_ENABLE_MPI +#include +#endif + #include #include #include @@ -26,6 +31,16 @@ namespace rajaperf */ std::string recursiveMkdir(const std::string& in_path) { +#ifdef RAJA_PERFSUITE_ENABLE_MPI + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + // Processes wait for rank 0 to make the directories before proceeding + if (rank != 0) { + MPI_Barrier(MPI_COMM_WORLD); + } +#endif + std::string dir; std::string path = in_path; @@ -72,7 +87,7 @@ std::string recursiveMkdir(const std::string& in_path) */ if (pos >= 0) { if (!S_ISDIR(status.st_mode)) { - std::cout << "Cannot create directories in path = " << path + getCout() << "Cannot create directories in path = " << path << "\n because some intermediate item in path exists and" << "is NOT a directory" << std::endl; outpath = std::string(); @@ -88,7 +103,7 @@ std::string recursiveMkdir(const std::string& in_path) */ if ( !outpath.empty() && pos < 0) { if (mkdir(path_buf, mode) != 0) { - std::cout << " Cannot create directory = " + getCout() << " Cannot create directory = " << path_buf << std::endl; outpath = std::string(); } @@ -113,7 +128,7 @@ std::string recursiveMkdir(const std::string& in_path) /* make directory if not at end of path */ if (pos < length) { if (mkdir(path_buf, mode) != 0) { - std::cout << " Cannot 
create directory = " + getCout() << " Cannot create directory = " << path_buf << std::endl; outpath = std::string(); } @@ -124,6 +139,13 @@ std::string recursiveMkdir(const std::string& in_path) delete[] path_buf; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + // Rank 0 lets the other processes know it made the directories + if (rank == 0) { + MPI_Barrier(MPI_COMM_WORLD); + } +#endif + return outpath; } diff --git a/src/common/OutputUtils.hpp b/src/common/OutputUtils.hpp index fc034a147..6ba77a408 100644 --- a/src/common/OutputUtils.hpp +++ b/src/common/OutputUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 41fcbd5e9..1bc10c31d 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -10,11 +10,18 @@ #include "RunParams.hpp" +#ifdef RAJA_PERFSUITE_ENABLE_MPI +#include +#endif + // // Basic kernels... 
// #include "basic/DAXPY.hpp" +#include "basic/DAXPY_ATOMIC.hpp" #include "basic/IF_QUAD.hpp" +#include "basic/INDEXLIST.hpp" +#include "basic/INDEXLIST_3LOOP.hpp" #include "basic/INIT3.hpp" #include "basic/INIT_VIEW1D.hpp" #include "basic/INIT_VIEW1D_OFFSET.hpp" @@ -24,6 +31,7 @@ #include "basic/PI_ATOMIC.hpp" #include "basic/PI_REDUCE.hpp" #include "basic/REDUCE3_INT.hpp" +#include "basic/REDUCE_STRUCT.hpp" #include "basic/TRAP_INT.hpp" // @@ -72,6 +80,7 @@ // #include "apps/WIP-COUPLE.hpp" #include "apps/DEL_DOT_VEC_2D.hpp" +#include "apps/DIFFUSION3DPA.hpp" #include "apps/ENERGY.hpp" #include "apps/FIR.hpp" #include "apps/HALOEXCHANGE.hpp" @@ -79,14 +88,17 @@ #include "apps/LTIMES.hpp" #include "apps/LTIMES_NOVIEW.hpp" #include "apps/MASS3DPA.hpp" +#include "apps/NODAL_ACCUMULATION_3D.hpp" #include "apps/PRESSURE.hpp" #include "apps/VOL3D.hpp" // // Algorithm kernels... // +#include "algorithm/SCAN.hpp" #include "algorithm/SORT.hpp" #include "algorithm/SORTPAIRS.hpp" +#include "algorithm/REDUCE_SUM.hpp" #include @@ -139,7 +151,10 @@ static const std::string KernelNames [] = // Basic kernels... 
// std::string("Basic_DAXPY"), + std::string("Basic_DAXPY_ATOMIC"), std::string("Basic_IF_QUAD"), + std::string("Basic_INDEXLIST"), + std::string("Basic_INDEXLIST_3LOOP"), std::string("Basic_INIT3"), std::string("Basic_INIT_VIEW1D"), std::string("Basic_INIT_VIEW1D_OFFSET"), @@ -149,6 +164,7 @@ static const std::string KernelNames [] = std::string("Basic_PI_ATOMIC"), std::string("Basic_PI_REDUCE"), std::string("Basic_REDUCE3_INT"), + std::string("Basic_REDUCE_STRUCT"), std::string("Basic_TRAP_INT"), // @@ -197,6 +213,7 @@ static const std::string KernelNames [] = // std::string("Apps_COUPLE"), std::string("Apps_DEL_DOT_VEC_2D"), + std::string("Apps_DIFFUSION3DPA"), std::string("Apps_ENERGY"), std::string("Apps_FIR"), std::string("Apps_HALOEXCHANGE"), @@ -204,14 +221,17 @@ static const std::string KernelNames [] = std::string("Apps_LTIMES"), std::string("Apps_LTIMES_NOVIEW"), std::string("Apps_MASS3DPA"), + std::string("Apps_NODAL_ACCUMULATION_3D"), std::string("Apps_PRESSURE"), std::string("Apps_VOL3D"), // // Algorithm kernels... // + std::string("Algorithm_SCAN"), std::string("Algorithm_SORT"), std::string("Algorithm_SORTPAIRS"), + std::string("Algorithm_REDUCE_SUM"), std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... @@ -346,7 +366,7 @@ const std::string& getVariantName(VariantID vid) /*! ******************************************************************************* * - * Return true if variant associated with VariantID enum value is available + * Return true if variant associated with VariantID enum value is available * to run; else false. 
* ******************************************************************************* @@ -359,22 +379,22 @@ bool isVariantAvailable(VariantID vid) ret_val = true; } #if defined(RUN_RAJA_SEQ) - if ( vid == Lambda_Seq || + if ( vid == Lambda_Seq || vid == RAJA_Seq ) { ret_val = true; } #endif #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || + if ( vid == Base_OpenMP || + vid == Lambda_OpenMP || vid == RAJA_OpenMP ) { ret_val = true; } #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - if ( vid == Base_OpenMPTarget || + if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) { ret_val = true; } @@ -399,6 +419,61 @@ bool isVariantAvailable(VariantID vid) return ret_val; } +/*! + ******************************************************************************* + * + * Return true if variant associated with VariantID enum value runs on the GPU. + * + ******************************************************************************* + */ +bool isVariantGPU(VariantID vid) +{ + bool ret_val = false; + + if ( vid == Base_Seq ) { + ret_val = false; + } +#if defined(RUN_RAJA_SEQ) + if ( vid == Lambda_Seq || + vid == RAJA_Seq ) { + ret_val = false; + } +#endif + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if ( vid == Base_OpenMP || + vid == Lambda_OpenMP || + vid == RAJA_OpenMP ) { + ret_val = false; + } +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + if ( vid == Base_OpenMPTarget || + vid == RAJA_OpenMPTarget ) { + ret_val = false; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) + if ( vid == Base_CUDA || + vid == Lambda_CUDA || + vid == RAJA_CUDA ) { + ret_val = true; + } +#endif + +#if defined(RAJA_ENABLE_HIP) + if ( vid == Base_HIP || + vid == Lambda_HIP || + vid == RAJA_HIP ) { + ret_val = true; + } +#endif + + return ret_val; +} + /* ******************************************************************************* * @@ -432,10 +507,22 @@ KernelBase* getKernelObject(KernelID kid, kernel = new 
basic::DAXPY(run_params); break; } + case Basic_DAXPY_ATOMIC : { + kernel = new basic::DAXPY_ATOMIC(run_params); + break; + } case Basic_IF_QUAD : { kernel = new basic::IF_QUAD(run_params); break; } + case Basic_INDEXLIST : { + kernel = new basic::INDEXLIST(run_params); + break; + } + case Basic_INDEXLIST_3LOOP : { + kernel = new basic::INDEXLIST_3LOOP(run_params); + break; + } case Basic_INIT3 : { kernel = new basic::INIT3(run_params); break; @@ -472,6 +559,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::REDUCE3_INT(run_params); break; } + case Basic_REDUCE_STRUCT : { + kernel = new basic::REDUCE_STRUCT(run_params); + break; + } case Basic_TRAP_INT : { kernel = new basic::TRAP_INT(run_params); break; @@ -616,6 +707,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::DEL_DOT_VEC_2D(run_params); break; } + case Apps_DIFFUSION3DPA : { + kernel = new apps::DIFFUSION3DPA(run_params); + break; + } case Apps_ENERGY : { kernel = new apps::ENERGY(run_params); break; @@ -644,6 +739,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::MASS3DPA(run_params); break; } + case Apps_NODAL_ACCUMULATION_3D : { + kernel = new apps::NODAL_ACCUMULATION_3D(run_params); + break; + } case Apps_PRESSURE : { kernel = new apps::PRESSURE(run_params); break; @@ -656,6 +755,10 @@ KernelBase* getKernelObject(KernelID kid, // // Algorithm kernels... 
// + case Algorithm_SCAN: { + kernel = new algorithm::SCAN(run_params); + break; + } case Algorithm_SORT: { kernel = new algorithm::SORT(run_params); break; @@ -664,9 +767,13 @@ KernelBase* getKernelObject(KernelID kid, kernel = new algorithm::SORTPAIRS(run_params); break; } + case Algorithm_REDUCE_SUM: { + kernel = new algorithm::REDUCE_SUM(run_params); + break; + } default: { - std::cout << "\n Unknown Kernel ID = " << kid << std::endl; + getCout() << "\n Unknown Kernel ID = " << kid << std::endl; } } // end switch on kernel id @@ -674,4 +781,39 @@ KernelBase* getKernelObject(KernelID kid, return kernel; } +// subclass of streambuf that ignores overflow +// never printing anything to the underlying stream +struct NullStream : std::streambuf, std::ostream +{ + using Base = std::streambuf; + using int_type = typename Base::int_type; + + NullStream() : std::ostream(this) {} +public: + int_type overflow(int_type c) override { return c; } +}; + +std::ostream* makeNullStream() +{ + return new NullStream(); +} + +std::ostream& getNullStream() +{ + static NullStream null_stream; + return null_stream; +} + +std::ostream& getCout() +{ + int rank = 0; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#endif + if (rank == 0) { + return std::cout; + } + return getNullStream(); +} + } // closing brace for rajaperf namespace diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 367aeed72..fad672137 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -14,8 +14,10 @@ #define RAJAPerfSuite_HPP #include "RAJA/config.hpp" +#include "rajaperf_config.hpp" #include +#include namespace rajaperf { @@ -31,8 +33,8 @@ class RunParams; * * IMPORTANT: This is only modified when a group is added or removed. * - * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) - * WITH ARRAY OF GROUP NAMES IN IMPLEMENTATION FILE!!! + * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) + * WITH ARRAY OF GROUP NAMES IN IMPLEMENTATION FILE!!! * ******************************************************************************* */ @@ -58,8 +60,8 @@ enum GroupID { * * IMPORTANT: This is only modified when a kernel is added or removed. * - * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) - * WITH ARRAY OF KERNEL NAMES IN IMPLEMENTATION FILE!!! + * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) + * WITH ARRAY OF KERNEL NAMES IN IMPLEMENTATION FILE!!! * ******************************************************************************* */ @@ -69,7 +71,10 @@ enum KernelID { // Basic kernels... // Basic_DAXPY = 0, + Basic_DAXPY_ATOMIC, Basic_IF_QUAD, + Basic_INDEXLIST, + Basic_INDEXLIST_3LOOP, Basic_INIT3, Basic_INIT_VIEW1D, Basic_INIT_VIEW1D_OFFSET, @@ -79,6 +84,7 @@ enum KernelID { Basic_PI_ATOMIC, Basic_PI_REDUCE, Basic_REDUCE3_INT, + Basic_REDUCE_STRUCT, Basic_TRAP_INT, // @@ -127,6 +133,7 @@ enum KernelID { // Apps_COUPLE, Apps_DEL_DOT_VEC_2D, + Apps_DIFFUSION3DPA, Apps_ENERGY, Apps_FIR, Apps_HALOEXCHANGE, @@ -134,14 +141,17 @@ enum KernelID { Apps_LTIMES, Apps_LTIMES_NOVIEW, Apps_MASS3DPA, + Apps_NODAL_ACCUMULATION_3D, Apps_PRESSURE, Apps_VOL3D, // // Algorithm kernels... // + Algorithm_SCAN, Algorithm_SORT, Algorithm_SORTPAIRS, + Algorithm_REDUCE_SUM, NumKernels // Keep this one last and NEVER comment out (!!) @@ -156,7 +166,7 @@ enum KernelID { * IMPORTANT: This is only modified when a new variant is added to the suite. 
* * IT MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) WITH - * ARRAY OF VARIANT NAMES IN IMPLEMENTATION FILE!!! + * ARRAY OF VARIANT NAMES IN IMPLEMENTATION FILE!!! * ******************************************************************************* */ @@ -206,7 +216,7 @@ enum FeatureID { Sort, Scan, - Workgroup, + Workgroup, Reduction, Atomic, @@ -256,18 +266,28 @@ const std::string& getFullKernelName(KernelID kid); * ******************************************************************************* */ -const std::string& getVariantName(VariantID vid); +const std::string& getVariantName(VariantID vid); /*! ******************************************************************************* * - * \brief Return true if variant associated with VariantID enum value is + * \brief Return true if variant associated with VariantID enum value is * available * to run; else false. * ******************************************************************************* */ bool isVariantAvailable(VariantID vid); +/*! + ******************************************************************************* + * + * \brief Return true if variant associated with VariantID enum value runs + * on the gpu. + * + ******************************************************************************* + */ +bool isVariantGPU(VariantID vid); + /*! ******************************************************************************* * @@ -288,6 +308,45 @@ const std::string& getFeatureName(FeatureID vid); */ KernelBase* getKernelObject(KernelID kid, const RunParams& run_params); +/*! + ******************************************************************************* + * + * \brief Return ostream used as cout. + * + * IMPORTANT: May return a non-printing stream when MPI is enabled. + * + ******************************************************************************* + */ +std::ostream& getCout(); + +/*! + ******************************************************************************* + * + * \brief Return non-printing ostream. 
+ * + ******************************************************************************* + */ +std::ostream* makeNullStream(); + +/*! + ******************************************************************************* + * + * \brief Return reference to global non-printing ostream. + * + ******************************************************************************* + */ +std::ostream& getNullStream(); + +/*! + ******************************************************************************* + * + * \brief Empty function used to squash compiler warnings for unused variables. + * + ******************************************************************************* + */ +template < typename... Ts > +inline void ignore_unused(Ts&&...) { } + } // closing brace for rajaperf namespace #endif // closing endif for header file include guard diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index e48ba53c1..d9a2865b3 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -17,7 +17,7 @@ // // Only one of the following (double or float) should be defined. -// +// #define RP_USE_DOUBLE //#undef RP_USE_DOUBLE @@ -41,7 +41,7 @@ namespace rajaperf * * \brief Type used for indexing in all kernel repetition loops. * - * It is volatile to ensure that kernels will not be optimized away by + * It is volatile to ensure that kernels will not be optimized away by * compilers, which can happen in some circumstances. 
* ****************************************************************************** @@ -81,6 +81,8 @@ using Int_ptr = Int_type*; ****************************************************************************** */ using Checksum_type = long double; +/// +#define Checksum_MPI_type MPI_LONG_DOUBLE /*! diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 115bdea55..26cc63742 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -29,19 +29,29 @@ RunParams::RunParams(int argc, char** argv) : input_state(Undefined), show_progress(false), npasses(1), + npasses_combiners(), rep_fact(1.0), size_meaning(SizeMeaning::Unset), size(0.0), size_factor(0.0), + gpu_block_sizes(), pf_tol(0.1), checkrun_reps(1), reference_variant(), kernel_input(), invalid_kernel_input(), + exclude_kernel_input(), + invalid_exclude_kernel_input(), variant_input(), invalid_variant_input(), + exclude_variant_input(), + invalid_exclude_variant_input(), feature_input(), invalid_feature_input(), + exclude_feature_input(), + invalid_exclude_feature_input(), + npasses_combiner_input(), + invalid_npasses_combiner_input(), outdir(), outfile_prefix("RAJAPerf") { @@ -70,19 +80,35 @@ RunParams::~RunParams() */ void RunParams::print(std::ostream& str) const { - str << "\n show_progress = " << show_progress; - str << "\n npasses = " << npasses; + str << "\n show_progress = " << show_progress; + str << "\n npasses = " << npasses; + str << "\n npasses combiners = "; + for (size_t j = 0; j < npasses_combiners.size(); ++j) { + str << "\n\t" << CombinerOptToStr(npasses_combiners[j]); + } + str << "\n npasses_combiners_input = "; + for (size_t j = 0; j < 
npasses_combiner_input.size(); ++j) { + str << "\n\t" << npasses_combiner_input[j]; + } + str << "\n invalid_npasses_combiners_input = "; + for (size_t j = 0; j < invalid_npasses_combiner_input.size(); ++j) { + str << "\n\t" << invalid_npasses_combiner_input[j]; + } str << "\n rep_fact = " << rep_fact; str << "\n size_meaning = " << SizeMeaningToStr(getSizeMeaning()); str << "\n size = " << size; str << "\n size_factor = " << size_factor; - str << "\n pf_tol = " << pf_tol; - str << "\n checkrun_reps = " << checkrun_reps; - str << "\n reference_variant = " << reference_variant; - str << "\n outdir = " << outdir; - str << "\n outfile_prefix = " << outfile_prefix; + str << "\n gpu_block_sizes = "; + for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { + str << "\n\t" << gpu_block_sizes[j]; + } + str << "\n pf_tol = " << pf_tol; + str << "\n checkrun_reps = " << checkrun_reps; + str << "\n reference_variant = " << reference_variant; + str << "\n outdir = " << outdir; + str << "\n outfile_prefix = " << outfile_prefix; - str << "\n kernel_input = "; + str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { str << "\n\t" << kernel_input[j]; } @@ -91,15 +117,33 @@ void RunParams::print(std::ostream& str) const str << "\n\t" << invalid_kernel_input[j]; } - str << "\n variant_input = "; + str << "\n exclude_kernel_input = "; + for (size_t j = 0; j < exclude_kernel_input.size(); ++j) { + str << "\n\t" << exclude_kernel_input[j]; + } + str << "\n invalid_exclude_kernel_input = "; + for (size_t j = 0; j < invalid_exclude_kernel_input.size(); ++j) { + str << "\n\t" << invalid_exclude_kernel_input[j]; + } + + str << "\n variant_input = "; for (size_t j = 0; j < variant_input.size(); ++j) { str << "\n\t" << variant_input[j]; } - str << "\n invalid_variant_input = "; + str << "\n invalid_variant_input = "; for (size_t j = 0; j < invalid_variant_input.size(); ++j) { str << "\n\t" << invalid_variant_input[j]; } + str << "\n exclude_variant_input = "; + for 
(size_t j = 0; j < exclude_variant_input.size(); ++j) { + str << "\n\t" << exclude_variant_input[j]; + } + str << "\n invalid_exclude_variant_input = "; + for (size_t j = 0; j < invalid_exclude_variant_input.size(); ++j) { + str << "\n\t" << invalid_exclude_variant_input[j]; + } + str << "\n feature_input = "; for (size_t j = 0; j < feature_input.size(); ++j) { str << "\n\t" << feature_input[j]; @@ -109,6 +153,15 @@ void RunParams::print(std::ostream& str) const str << "\n\t" << invalid_feature_input[j]; } + str << "\n exclude_feature_input = "; + for (size_t j = 0; j < exclude_feature_input.size(); ++j) { + str << "\n\t" << exclude_feature_input[j]; + } + str << "\n invalid_exclude_feature_input = "; + for (size_t j = 0; j < invalid_exclude_feature_input.size(); ++j) { + str << "\n\t" << invalid_exclude_feature_input[j]; + } + str << std::endl; str.flush(); } @@ -123,7 +176,7 @@ void RunParams::print(std::ostream& str) const */ void RunParams::parseCommandLineOptions(int argc, char** argv) { - std::cout << "\n\nReading command line input..." << std::endl; + getCout() << "\n\nReading command line input..." 
<< std::endl; for (int i = 1; i < argc; ++i) { @@ -132,7 +185,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( opt == std::string("--help") || opt == std::string("-h") ) { - printHelpMessage(std::cout); + printHelpMessage(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--show-progress") || @@ -142,55 +195,70 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt == std::string("--print-kernels") || opt == std::string("-pk") ) { - - printFullKernelNames(std::cout); + + printFullKernelNames(getCout()); input_state = InfoRequest; - + } else if ( opt == std::string("--print-variants") || opt == std::string("-pv") ) { - printVariantNames(std::cout); + printVariantNames(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-features") || opt == std::string("-pf") ) { - printFeatureNames(std::cout); + printFeatureNames(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-feature-kernels") || opt == std::string("-pfk") ) { - printFeatureKernels(std::cout); + printFeatureKernels(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-kernel-features") || opt == std::string("-pkf") ) { - printKernelFeatures(std::cout); + printKernelFeatures(getCout()); input_state = InfoRequest; - + } else if ( opt == std::string("--npasses") ) { i++; - if ( i < argc ) { + if ( i < argc ) { npasses = ::atoi( argv[i] ); } else { - std::cout << "\nBad input:" - << " must give --npasses a value for number of passes (int)" - << std::endl; + getCout() << "\nBad input:" + << " must give --npasses a value for number of passes (int)" + << std::endl; input_state = BadInput; } + } else if ( opt == std::string("--npasses-combiners") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + npasses_combiner_input.push_back(opt); + ++i; + } + } + } else if ( 
opt == std::string("--repfact") ) { i++; - if ( i < argc ) { + if ( i < argc ) { rep_fact = ::atof( argv[i] ); } else { - std::cout << "\nBad input:" - << " must give --rep_fact a value (double)" - << std::endl; + getCout() << "\nBad input:" + << " must give --rep_fact a value (double)" + << std::endl; input_state = BadInput; } @@ -199,7 +267,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) i++; if ( i < argc ) { if (size_meaning == SizeMeaning::Direct) { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " may only set one of --size and --sizefact" << std::endl; input_state = BadInput; @@ -208,14 +276,14 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( size_factor >= 0.0 ) { size_meaning = SizeMeaning::Factor; } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --sizefact a POSITIVE value (double)" << std::endl; input_state = BadInput; } } } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --sizefact a value (double)" << std::endl; input_state = BadInput; @@ -226,7 +294,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) i++; if ( i < argc ) { if (size_meaning == SizeMeaning::Factor) { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " may only set one of --size and --sizefact" << std::endl; input_state = BadInput; @@ -235,19 +303,50 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( size >= 0.0 ) { size_meaning = SizeMeaning::Direct; } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --size a POSITIVE value (double)" << std::endl; input_state = BadInput; } } } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --size a value (int)" << std::endl; input_state = BadInput; } + } else if ( opt == std::string("--gpu_block_size") ) { + + bool got_someting = false; + bool done = false; + i++; + while ( i < argc && !done ) { + opt = 
std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + got_someting = true; + int gpu_block_size = ::atoi( opt.c_str() ); + if ( gpu_block_size <= 0 ) { + std::cout << "\nBad input:" + << " must give --gpu_block_size POSITIVE values (int)" + << std::endl; + input_state = BadInput; + } else { + gpu_block_sizes.push_back(gpu_block_size); + } + ++i; + } + } + if (!got_someting) { + std::cout << "\nBad input:" + << " must give --gpu_block_size one or more values (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--pass-fail-tol") || opt == std::string("-pftol") ) { @@ -255,7 +354,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( i < argc ) { pf_tol = ::atof( argv[i] ); } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --pass-fail-tol (or -pftol) a value (double)" << std::endl; input_state = BadInput; @@ -277,6 +376,22 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } + } else if ( opt == std::string("--exclude-kernels") || + opt == std::string("-ek") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + exclude_kernel_input.push_back(opt); + ++i; + } + } + } else if ( std::string(argv[i]) == std::string("--variants") || std::string(argv[i]) == std::string("-v") ) { @@ -293,6 +408,22 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } + } else if ( std::string(argv[i]) == std::string("--exclude-variants") || + std::string(argv[i]) == std::string("-ev") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + exclude_variant_input.push_back(opt); + ++i; + } + } + } else if ( std::string(argv[i]) == std::string("--features") || std::string(argv[i]) == std::string("-f") ) { @@ -309,6 +440,22 @@ void 
RunParams::parseCommandLineOptions(int argc, char** argv) } } + } else if ( std::string(argv[i]) == std::string("--exclude-features") || + std::string(argv[i]) == std::string("-ef") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + exclude_feature_input.push_back(opt); + ++i; + } + } + } else if ( std::string(argv[i]) == std::string("--outdir") || std::string(argv[i]) == std::string("-od") ) { @@ -353,10 +500,10 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if (input_state != BadInput) { input_state = DryRun; } - + } else if ( std::string(argv[i]) == std::string("--checkrun") ) { - input_state = CheckRun; + input_state = CheckRun; i++; if ( i < argc ) { @@ -370,12 +517,12 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } else { - + input_state = BadInput; - std::string huh(argv[i]); - std::cout << "\nUnknown option: " << huh << std::endl; - std::cout.flush(); + std::string huh(argv[i]); + getCout() << "\nUnknown option: " << huh << std::endl; + getCout().flush(); } @@ -386,13 +533,18 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) size_meaning = SizeMeaning::Factor; size_factor = 1.0; } + + // Default npasses_combiners if no input + if (npasses_combiner_input.empty()) { + npasses_combiners.emplace_back(CombinerOpt::Average); + } } void RunParams::printHelpMessage(std::ostream& str) const { str << "\nUsage: ./raja-perf.exe [options]\n"; - str << "Valid options are:\n"; + str << "Valid options are:\n"; str << "\t --help, -h (print options with descriptions)\n\n"; @@ -411,9 +563,15 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t (print names of features used by each kernel)\n\n"; str << "\t --npasses [default is 1]\n" - << "\t (num passes through Suite)\n"; + << "\t (num passes through Suite)\n"; str << "\t\t Example...\n" - << "\t\t --npasses 2 (runs complete Suite twice\n\n"; + << 
"\t\t --npasses 2 (runs complete Suite twice)\n\n"; + + str << "\t --npasses-combiners [Default is average]\n" + << "\t (Ways of combining npasses timing data into timing files)\n"; + str << "\t\t Example...\n" + << "\t\t --npasses-combiners Average Minimum Maximum (produce average, min, and\n" + << "\t\t max timing .csv files)\n\n"; str << "\t --repfact [default is 1.0]\n" << "\t (multiplier on default # reps to run each kernel)\n"; @@ -432,30 +590,56 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --size 1000000 (runs kernels with size ~1,000,000)\n\n"; + str << "\t --gpu_block_size [no default]\n" + << "\t (block sizes to run for all GPU kernels)\n" + << "\t (GPU kernels not supporting gpu_block_size will be skipped)\n" + << "\t (Support is determined by kernel implementation and cmake variable RAJA_PERFSUITE_GPU_BLOCKSIZES)\n"; + str << "\t\t Example...\n" + << "\t\t --gpu_block_size 128 256 512 (runs kernels with gpu_block_size 128, 256, and 512)\n\n"; + str << "\t --pass-fail-tol, -pftol [default is 0.1; i.e., 10%]\n" << "\t (slowdown tolerance for RAJA vs. 
Base variants in FOM report)\n"; str << "\t\t Example...\n" << "\t\t -pftol 0.2 (RAJA kernel variants that run 20% or more slower than Base variants will be reported as OVER_TOL in FOM report)\n\n"; str << "\t --kernels, -k [Default is run all]\n" - << "\t (names of individual kernels and/or groups of kernels to run)\n"; + << "\t (names of individual kernels and/or groups of kernels to run)\n"; str << "\t\t Examples...\n" << "\t\t --kernels Polybench (run all kernels in Polybench group)\n" << "\t\t -k INIT3 MULADDSUB (run INIT3 and MULADDSUB kernels)\n" - << "\t\t -k INIT3 Apps (run INIT3 kernsl and all kernels in Apps group)\n\n"; + << "\t\t -k INIT3 Apps (run INIT3 kernel and all kernels in Apps group)\n\n"; + + str << "\t --exclude-kernels, -ek [Default is exclude none]\n" + << "\t (names of individual kernels and/or groups of kernels to exclude)\n"; + str << "\t\t Examples...\n" + << "\t\t --exclude-kernels Polybench (exclude all kernels in Polybench group)\n" + << "\t\t -ek INIT3 MULADDSUB (exclude INIT3 and MULADDSUB kernels)\n" + << "\t\t -ek INIT3 Apps (exclude INIT3 kernel and all kernels in Apps group)\n\n"; str << "\t --variants, -v [Default is run all]\n" - << "\t (names of variants to run)\n"; + << "\t (names of variants to run)\n"; str << "\t\t Examples...\n" << "\t\t --variants RAJA_CUDA (run all RAJA_CUDA kernel variants)\n" << "\t\t -v Base_Seq RAJA_CUDA (run Base_Seq and RAJA_CUDA variants)\n\n"; + str << "\t --exclude-variants, -ev [Default is exclude none]\n" + << "\t (names of variants to exclude)\n"; + str << "\t\t Examples...\n" + << "\t\t --exclude-variants RAJA_CUDA (exclude all RAJA_CUDA kernel variants)\n" + << "\t\t -ev Base_Seq RAJA_CUDA (exclude Base_Seq and RAJA_CUDA variants)\n\n"; + str << "\t --features, -f [Default is run all]\n" << "\t (names of features to run)\n"; str << "\t\t Examples...\n" << "\t\t --features Forall (run all kernels that use RAJA forall)\n" << "\t\t -f Forall Reduction (run all kernels that use RAJA forall or 
RAJA reductions)\n\n"; + str << "\t --exclude-features, -ef [Default is exclude none]\n" + << "\t (names of features to exclude)\n"; + str << "\t\t Examples...\n" + << "\t\t --exclude-features Forall (exclude all kernels that use RAJA forall)\n" + << "\t\t -ef Forall Reduction (exclude all kernels that use RAJA forall or RAJA reductions)\n\n"; + str << "\t --outdir, -od [Default is current directory]\n" << "\t (directory path for output data files)\n"; str << "\t\t Examples...\n" @@ -476,7 +660,7 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --dryrun (print summary of how Suite will run without running it)\n\n"; str << "\t --checkrun [default is 1]\n" -<< "\t (run each kernel a given number of times; usually to check things are working properly or to reduce aggregate execution time)\n"; +<< "\t (run each kernel a given number of times; usually to check things are working properly or to reduce aggregate execution time)\n"; str << "\t\t Example...\n" << "\t\t --checkrun 2 (run each kernel twice)\n\n"; @@ -572,7 +756,7 @@ void RunParams::printKernelFeatures(std::ostream& str) const str << "\nAvailable kernels and features each uses:"; str << "\n-----------------------------------------\n"; for (int kid = 0; kid < NumKernels; ++kid) { - KernelID tkid = static_cast(kid); + KernelID tkid = static_cast(kid); /// RDH DISABLE COUPLE KERNEL if (tkid != Apps_COUPLE) { str << getFullKernelName(tkid) << std::endl; @@ -584,7 +768,7 @@ void RunParams::printKernelFeatures(std::ostream& str) const } } // loop over features delete kern; - } + } } // loop over kernels str.flush(); } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index d9437bf9e..d0e7d81bf 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National 
Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -37,14 +37,37 @@ class RunParams { enum InputOpt { InfoRequest, /*!< option requesting information */ DryRun, /*!< report summary of how suite will run w/o running */ - CheckRun, /*!< run suite with small rep count to make sure + CheckRun, /*!< run suite with small rep count to make sure everything works properly */ - PerfRun, /*!< input defines a valid performance run, + PerfRun, /*!< input defines a valid performance run, suite will run as specified */ - BadInput, /*!< erroneous input given */ + BadInput, /*!< erroneous input given */ Undefined /*!< input not defined (yet) */ }; + /*! + * \brief Enumeration indicating state of combiner options requested + */ + enum CombinerOpt { + Average, /*!< option requesting average */ + Minimum, /*!< option requesting minimum */ + Maximum /*!< option requesting maximum */ + }; + + static std::string CombinerOptToStr(CombinerOpt co) + { + switch (co) { + case CombinerOpt::Average: + return "Average"; + case CombinerOpt::Minimum: + return "Minimum"; + case CombinerOpt::Maximum: + return "Maximum"; + default: + return "Unknown"; + } + } + /*! * \brief Enumeration indicating how to interpret size input */ @@ -71,7 +94,7 @@ class RunParams { //@{ //! @name Methods to get/set input state - InputOpt getInputState() const { return input_state; } + InputOpt getInputState() const { return input_state; } /*! * \brief Set whether run parameters (from input) are valid. 
@@ -90,6 +113,11 @@ class RunParams { double getRepFactor() const { return rep_fact; } + const std::vector& getNpassesCombinerOpts() const + { return npasses_combiners; } + void setNpassesCombinerOpts( std::vector& cvec ) + { npasses_combiners = cvec; } + SizeMeaning getSizeMeaning() const { return size_meaning; } @@ -97,26 +125,51 @@ class RunParams { double getSizeFactor() const { return size_factor; } + size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } + bool validGPUBlockSize(size_t block_size) const + { + for (size_t valid_block_size : gpu_block_sizes) { + if (valid_block_size == block_size) { + return true; + } + } + return false; + } + double getPFTolerance() const { return pf_tol; } int getCheckRunReps() const { return checkrun_reps; } const std::string& getReferenceVariant() const { return reference_variant; } - const std::vector& getKernelInput() const + const std::vector& getKernelInput() const { return kernel_input; } void setInvalidKernelInput( std::vector& svec ) { invalid_kernel_input = svec; } const std::vector& getInvalidKernelInput() const { return invalid_kernel_input; } - const std::vector& getVariantInput() const + const std::vector& getExcludeKernelInput() const + { return exclude_kernel_input; } + void setInvalidExcludeKernelInput( std::vector& svec ) + { invalid_exclude_kernel_input = svec; } + const std::vector& getInvalidExcludeKernelInput() const + { return invalid_exclude_kernel_input; } + + const std::vector& getVariantInput() const { return variant_input; } void setInvalidVariantInput( std::vector& svec ) { invalid_variant_input = svec; } const std::vector& getInvalidVariantInput() const { return invalid_variant_input; } + const std::vector& getExcludeVariantInput() const + { return exclude_variant_input; } + void setInvalidExcludeVariantInput( std::vector& svec ) + { invalid_exclude_variant_input = svec; } + const std::vector& getInvalidExcludeVariantInput() const + { return invalid_exclude_variant_input; } + 
const std::vector& getFeatureInput() const { return feature_input; } void setInvalidFeatureInput( std::vector& svec ) @@ -124,6 +177,20 @@ class RunParams { const std::vector& getInvalidFeatureInput() const { return invalid_feature_input; } + const std::vector& getExcludeFeatureInput() const + { return exclude_feature_input; } + void setInvalidExcludeFeatureInput( std::vector& svec ) + { invalid_exclude_feature_input = svec; } + const std::vector& getInvalidExcludeFeatureInput() const + { return invalid_exclude_feature_input; } + + const std::vector& getNpassesCombinerOptInput() const + { return npasses_combiner_input; } + const std::vector& getInvalidNpassesCombinerOptInput() const + { return invalid_npasses_combiner_input; } + void setInvalidNpassesCombinerOptInput( std::vector& svec ) + { invalid_npasses_combiner_input = svec; } + const std::string& getOutputDirName() const { return outdir; } const std::string& getOutputFilePrefix() const { return outfile_prefix; } @@ -157,11 +224,15 @@ class RunParams { int npasses; /*!< Number of passes through suite */ + std::vector npasses_combiners; /*!< Combiners to use when + outputting timer data */ + double rep_fact; /*!< pct of default kernel reps to run */ SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ double size_factor; /*!< default kernel size multipier (input option) */ + std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ double pf_tol; /*!< pct RAJA variant run time can exceed base for each PM case to pass/fail acceptance */ @@ -169,18 +240,27 @@ class RunParams { int checkrun_reps; /*!< Num reps each kernel is run in check run */ std::string reference_variant; /*!< Name of reference variant for speedup - calculations */ + calculations */ // - // Arrays to hold input strings for valid/invalid input. Helpful for + // Arrays to hold input strings for valid/invalid input. Helpful for // debugging command line args. 
// std::vector kernel_input; std::vector invalid_kernel_input; + std::vector exclude_kernel_input; + std::vector invalid_exclude_kernel_input; std::vector variant_input; std::vector invalid_variant_input; + std::vector exclude_variant_input; + std::vector invalid_exclude_variant_input; std::vector feature_input; std::vector invalid_feature_input; + std::vector exclude_feature_input; + std::vector invalid_exclude_feature_input; + + std::vector npasses_combiner_input; + std::vector invalid_npasses_combiner_input; std::string outdir; /*!< Output directory name. */ std::string outfile_prefix; /*!< Prefix for output data file names. */ diff --git a/src/lcals/CMakeLists.txt b/src/lcals/CMakeLists.txt index 90cc8bc9a..5f88c8c69 100644 --- a/src/lcals/CMakeLists.txt +++ b/src/lcals/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index 7be0908b4..0ef286507 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define DIFF_PREDICT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(px, m_px, m_array_length); \ allocAndInitCudaDeviceData(cx, m_cx, m_array_length); @@ -36,18 +30,21 @@ namespace lcals deallocCudaDeviceData(px); \ deallocCudaDeviceData(cx); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void diff_predict(Real_ptr px, Real_ptr cx, const Index_type offset, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { DIFF_PREDICT_BODY; } } -void DIFF_PREDICT::runCudaVariant(VariantID vid) +template < size_t block_size > +void DIFF_PREDICT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -63,7 +60,7 @@ void DIFF_PREDICT::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - diff_predict<<>>( px, cx, + diff_predict<<>>( px, cx, offset, iend ); cudaErrchk( cudaGetLastError() ); @@ -91,10 +88,12 @@ void DIFF_PREDICT::runCudaVariant(VariantID vid) DIFF_PREDICT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n DIFF_PREDICT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DIFF_PREDICT, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 8bc38e983..4f076157d 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, 
Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define DIFF_PREDICT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(px, m_px, m_array_length); \ allocAndInitHipDeviceData(cx, m_cx, m_array_length); @@ -36,18 +30,21 @@ namespace lcals deallocHipDeviceData(px); \ deallocHipDeviceData(cx); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void diff_predict(Real_ptr px, Real_ptr cx, const Index_type offset, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { DIFF_PREDICT_BODY; } } -void DIFF_PREDICT::runHipVariant(VariantID vid) +template < size_t block_size > +void DIFF_PREDICT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -63,7 +60,7 @@ void DIFF_PREDICT::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((diff_predict), dim3(grid_size), dim3(block_size), 0, 0, px, cx, + hipLaunchKernelGGL((diff_predict), dim3(grid_size), dim3(block_size), 0, 0, px, cx, offset, iend ); hipErrchk( hipGetLastError() ); @@ -91,10 +88,12 @@ void DIFF_PREDICT::runHipVariant(VariantID vid) DIFF_PREDICT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n DIFF_PREDICT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DIFF_PREDICT, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT-OMP.cpp b/src/lcals/DIFF_PREDICT-OMP.cpp index 0b50b2bb4..e83c208bd 100644 --- 
a/src/lcals/DIFF_PREDICT-OMP.cpp +++ b/src/lcals/DIFF_PREDICT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void DIFF_PREDICT::runOpenMPVariant(VariantID vid) +void DIFF_PREDICT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void DIFF_PREDICT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp index dadb14c81..44e78452f 100644 --- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -39,7 +39,7 @@ namespace lcals deallocOpenMPDeviceData(cx, did); -void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid) +void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -55,7 +55,7 @@ void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(px, cx) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { DIFF_PREDICT_BODY; } @@ -83,7 +83,7 @@ void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid) DIFF_PREDICT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n DIFF_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp index 7329386eb..bff82a6eb 100644 --- a/src/lcals/DIFF_PREDICT-Seq.cpp +++ b/src/lcals/DIFF_PREDICT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void DIFF_PREDICT::runSeqVariant(VariantID vid) +void DIFF_PREDICT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,7 +62,7 @@ void DIFF_PREDICT::runSeqVariant(VariantID vid) break; } - + case RAJA_Seq : { startTimer(); @@ -79,7 +79,7 @@ void DIFF_PREDICT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index d1a96a101..338ba7d0d 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -55,7 +55,7 @@ DIFF_PREDICT::~DIFF_PREDICT() { } -void DIFF_PREDICT::setUp(VariantID vid) +void DIFF_PREDICT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_array_length = getActualProblemSize() * 14; m_offset = getActualProblemSize(); @@ -64,12 +64,12 @@ void DIFF_PREDICT::setUp(VariantID vid) allocAndInitData(m_cx, m_array_length, vid); } -void DIFF_PREDICT::updateChecksum(VariantID vid) +void DIFF_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_px, m_array_length); + checksum[vid][tune_idx] += calcChecksum(m_px, m_array_length); } -void DIFF_PREDICT::tearDown(VariantID vid) +void DIFF_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_px); diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 504dd8bd7..130071412 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -84,17 +84,27 @@ class DIFF_PREDICT : public KernelBase ~DIFF_PREDICT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_px; Real_ptr m_cx; diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index c66a99545..f99828d46 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define EOS_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_array_length); \ allocAndInitCudaDeviceData(y, m_y, m_array_length); \ @@ -40,18 +34,21 @@ namespace lcals deallocCudaDeviceData(z); \ deallocCudaDeviceData(u); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void eos(Real_ptr x, Real_ptr y, Real_ptr z, Real_ptr u, Real_type q, Real_type r, Real_type t, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { EOS_BODY; } } -void EOS::runCudaVariant(VariantID vid) +template < size_t block_size > +void EOS::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -67,7 +64,7 @@ void EOS::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - eos<<>>( x, y, z, u, + eos<<>>( x, y, z, u, q, r, t, iend ); cudaErrchk( cudaGetLastError() ); @@ -95,10 +92,12 @@ void EOS::runCudaVariant(VariantID vid) EOS_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n EOS : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n EOS : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(EOS, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 53f952a25..0912ce5ce 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define EOS_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_array_length); \ allocAndInitHipDeviceData(y, m_y, m_array_length); \ @@ -40,18 +34,21 @@ namespace lcals deallocHipDeviceData(z); \ deallocHipDeviceData(u); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void eos(Real_ptr x, Real_ptr y, Real_ptr z, Real_ptr u, Real_type q, Real_type r, Real_type t, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { EOS_BODY; } } -void EOS::runHipVariant(VariantID vid) +template < size_t block_size > +void EOS::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -67,7 +64,7 @@ void EOS::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((eos), dim3(grid_size), dim3(block_size), 0, 0, x, y, z, u, + hipLaunchKernelGGL((eos), dim3(grid_size), dim3(block_size), 0, 0, x, y, z, u, q, r, t, iend ); hipErrchk( hipGetLastError() ); @@ -95,10 +92,12 @@ void EOS::runHipVariant(VariantID vid) EOS_DATA_TEARDOWN_HIP; } else { - std::cout << "\n EOS : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n EOS : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(EOS, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS-OMP.cpp b/src/lcals/EOS-OMP.cpp index 9654eef46..4a9688f03 100644 --- a/src/lcals/EOS-OMP.cpp +++ b/src/lcals/EOS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National 
Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void EOS::runOpenMPVariant(VariantID vid) +void EOS::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void EOS::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n EOS : Unknown variant id = " << vid << std::endl; + getCout() << "\n EOS : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp index 189746801..6cc2f832b 100644 --- a/src/lcals/EOS-OMPTarget.cpp +++ b/src/lcals/EOS-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(u, did); -void EOS::runOpenMPTargetVariant(VariantID vid) +void EOS::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -59,7 +59,7 @@ void EOS::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y, z, u) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { EOS_BODY; } @@ -79,15 +79,15 @@ void EOS::runOpenMPTargetVariant(VariantID vid) RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { EOS_BODY; - }); + }); } stopTimer(); EOS_DATA_TEARDOWN_OMP_TARGET - } else { - std::cout << "\n EOS : Unknown OMP Tagretvariant id = " << vid << std::endl; + } else { + getCout() << "\n EOS : Unknown OMP Tagretvariant id = " << vid << std::endl; } } diff --git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp index a33b776cc..3aaeabdde 100644 --- a/src/lcals/EOS-Seq.cpp +++ b/src/lcals/EOS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void EOS::runSeqVariant(VariantID vid) +void EOS::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -79,7 +79,7 @@ void EOS::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n EOS : Unknown variant id = " << vid << std::endl; + getCout() << "\n EOS : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 4a8671172..27bc43d06 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -63,7 +63,7 @@ EOS::~EOS() { } -void EOS::setUp(VariantID vid) +void EOS::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitData(m_y, m_array_length, vid); @@ -75,12 +75,12 @@ void EOS::setUp(VariantID vid) initData(m_t, vid); } -void EOS::updateChecksum(VariantID vid) +void EOS::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } -void EOS::tearDown(VariantID vid) +void EOS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 82a779ac2..f2d38b5e9 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore 
National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,17 +53,27 @@ class EOS : public KernelBase ~EOS(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_z; diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index f8330fdfc..b195c0d46 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define FIRST_DIFF_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_N); \ allocAndInitCudaDeviceData(y, m_y, m_N); @@ -36,17 +30,20 @@ namespace lcals deallocCudaDeviceData(x); \ deallocCudaDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_diff(Real_ptr x, Real_ptr y, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIRST_DIFF_BODY; } } -void FIRST_DIFF::runCudaVariant(VariantID vid) +template < size_t block_size > +void FIRST_DIFF::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,7 +59,7 @@ void FIRST_DIFF::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - first_diff<<>>( x, y, + first_diff<<>>( x, y, iend ); cudaErrchk( cudaGetLastError() ); @@ -89,10 +86,12 @@ void FIRST_DIFF::runCudaVariant(VariantID vid) FIRST_DIFF_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n FIRST_DIFF : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n FIRST_DIFF : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_DIFF, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 4ac557fec..382fa107e 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define FIRST_DIFF_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_N); \ allocAndInitHipDeviceData(y, m_y, m_N); @@ -36,17 +30,20 @@ namespace lcals deallocHipDeviceData(x); \ deallocHipDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_diff(Real_ptr x, Real_ptr y, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIRST_DIFF_BODY; } } -void FIRST_DIFF::runHipVariant(VariantID vid) +template < size_t block_size > +void FIRST_DIFF::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,7 +59,7 @@ void FIRST_DIFF::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((first_diff), dim3(grid_size), dim3(block_size), 0, 0, x, y, + hipLaunchKernelGGL((first_diff), dim3(grid_size), dim3(block_size), 0, 0, x, y, iend ); hipErrchk( hipGetLastError() ); @@ -89,10 +86,12 @@ void FIRST_DIFF::runHipVariant(VariantID vid) FIRST_DIFF_DATA_TEARDOWN_HIP; } else { - std::cout << "\n FIRST_DIFF : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n FIRST_DIFF : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_DIFF, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF-OMP.cpp b/src/lcals/FIRST_DIFF-OMP.cpp index 1c6287d78..73c945f3a 100644 --- a/src/lcals/FIRST_DIFF-OMP.cpp +++ b/src/lcals/FIRST_DIFF-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore 
National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void FIRST_DIFF::runOpenMPVariant(VariantID vid) +void FIRST_DIFF::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void FIRST_DIFF::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp index 85e022f1c..13c9a9888 100644 --- a/src/lcals/FIRST_DIFF-OMPTarget.cpp +++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -39,24 +39,24 @@ namespace lcals deallocOpenMPDeviceData(y, did); -void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid) +void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); FIRST_DIFF_DATA_SETUP; - + if ( vid == Base_OpenMPTarget ) { FIRST_DIFF_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { FIRST_DIFF_BODY; } @@ -65,11 +65,11 @@ void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_DIFF_DATA_TEARDOWN_OMP_TARGET; - + } else if ( vid == RAJA_OpenMPTarget ) { FIRST_DIFF_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -82,9 +82,9 @@ void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_DIFF_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n FIRST_DIFF : Unknown OMP Target variant id = " << vid << std::endl; + + } else { + getCout() << "\n FIRST_DIFF : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp index 3b6cefb9a..41837ff90 100644 --- a/src/lcals/FIRST_DIFF-Seq.cpp +++ b/src/lcals/FIRST_DIFF-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void FIRST_DIFF::runSeqVariant(VariantID vid) +void FIRST_DIFF::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -79,7 +79,7 @@ void FIRST_DIFF::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index c37c41aac..9272b20d4 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -59,18 +59,18 @@ FIRST_DIFF::~FIRST_DIFF() { } -void FIRST_DIFF::setUp(VariantID vid) +void FIRST_DIFF::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_N, 0.0, vid); allocAndInitData(m_y, m_N, vid); } -void FIRST_DIFF::updateChecksum(VariantID vid) +void FIRST_DIFF::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()); } -void FIRST_DIFF::tearDown(VariantID vid) +void FIRST_DIFF::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 21c279b89..51de73049 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -43,17 +43,27 @@ class FIRST_DIFF : public KernelBase ~FIRST_DIFF(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index ef87159f2..f98982860 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,35 +21,31 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define FIRST_MIN_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_N); #define FIRST_MIN_DATA_TEARDOWN_CUDA \ deallocCudaDeviceData(x); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_min(Real_ptr x, MyMinLoc* dminloc, Index_type iend) { extern __shared__ MyMinLoc minloc[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; minloc[ threadIdx.x ] = *dminloc; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { MyMinLoc& mymin = minloc[ threadIdx.x ]; FIRST_MIN_BODY; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { if ( minloc[ threadIdx.x + i].val < minloc[ threadIdx.x ].val ) { minloc[ threadIdx.x ] = minloc[ threadIdx.x + i]; @@ -66,7 +62,8 @@ __global__ void first_min(Real_ptr x, } -void FIRST_MIN::runCudaVariant(VariantID vid) +template < size_t block_size > +void FIRST_MIN::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -89,7 +86,7 @@ void FIRST_MIN::runCudaVariant(VariantID vid) cudaMemcpyHostToDevice ) ); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - first_min<<<<>>( x, dminloc, iend ); @@ -129,11 +126,12 @@ void FIRST_MIN::runCudaVariant(VariantID vid) FIRST_MIN_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_MIN, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 9880927e6..e2b2763cf 100644 
--- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,35 +21,31 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define FIRST_MIN_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_N); #define FIRST_MIN_DATA_TEARDOWN_HIP \ deallocHipDeviceData(x); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_min(Real_ptr x, MyMinLoc* dminloc, Index_type iend) { extern __shared__ MyMinLoc minloc[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; minloc[ threadIdx.x ] = *dminloc; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { MyMinLoc& mymin = minloc[ threadIdx.x ]; FIRST_MIN_BODY; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { if ( minloc[ threadIdx.x + i].val < minloc[ threadIdx.x ].val ) { minloc[ threadIdx.x ] = minloc[ threadIdx.x + i]; @@ -66,7 +62,8 @@ __global__ void first_min(Real_ptr x, } -void FIRST_MIN::runHipVariant(VariantID vid) +template < size_t block_size > +void FIRST_MIN::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -89,7 +86,7 @@ void FIRST_MIN::runHipVariant(VariantID vid) hipMemcpyHostToDevice ) ); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(first_min, grid_size, block_size, + hipLaunchKernelGGL((first_min), grid_size, block_size, sizeof(MyMinLoc)*block_size, 0, x, 
dminloc, iend ); @@ -129,11 +126,12 @@ void FIRST_MIN::runHipVariant(VariantID vid) FIRST_MIN_DATA_TEARDOWN_HIP; } else { - std::cout << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_MIN, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 85e2f7db0..ef7791739 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,14 +12,14 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { FIRST_MIN_MINLOC_COMPARE; -void FIRST_MIN::runOpenMPVariant(VariantID vid) +void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -43,7 +43,7 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid) #pragma omp parallel for reduction(minloc:mymin) for (Index_type i = ibegin; i < iend; ++i ) { - FIRST_MIN_BODY; + FIRST_MIN_BODY; } m_minloc = RAJA_MAX(m_minloc, mymin.loc); @@ -97,7 +97,7 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid) FIRST_MIN_BODY_RAJA; }); - m_minloc = RAJA_MAX(m_minloc, loc.getLoc()); + m_minloc = RAJA_MAX(m_minloc, loc.getLoc()); } stopTimer(); @@ -106,12 +106,12 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n FIRST_MIN : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_MIN : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp 
b/src/lcals/FIRST_MIN-OMPTarget.cpp index 41ac5f225..52472c588 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -37,18 +37,18 @@ namespace lcals FIRST_MIN_MINLOC_COMPARE; -void FIRST_MIN::runOpenMPTargetVariant(VariantID vid) +void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); FIRST_MIN_DATA_SETUP; - + if ( vid == Base_OpenMPTarget ) { FIRST_MIN_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -70,11 +70,11 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_MIN_DATA_TEARDOWN_OMP_TARGET; - + } else if ( vid == RAJA_OpenMPTarget ) { FIRST_MIN_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -92,9 +92,9 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_MIN_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n FIRST_MIN : Unknown OMP Target variant id = " << vid << std::endl; + + } else { + getCout() << "\n FIRST_MIN : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp index 560b78c34..7bb311675 100644 --- a/src/lcals/FIRST_MIN-Seq.cpp +++ b/src/lcals/FIRST_MIN-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National 
Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void FIRST_MIN::runSeqVariant(VariantID vid) +void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -97,7 +97,7 @@ void FIRST_MIN::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n FIRST_MIN : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_MIN : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index c6138e46a..8fe9a8c93 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -63,7 +63,7 @@ FIRST_MIN::~FIRST_MIN() { } -void FIRST_MIN::setUp(VariantID vid) +void FIRST_MIN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_N, 0.0, vid); m_x[ m_N / 2 ] = -1.0e+10; @@ -72,12 +72,12 @@ void FIRST_MIN::setUp(VariantID vid) m_minloc = -1; } -void FIRST_MIN::updateChecksum(VariantID vid) +void FIRST_MIN::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += static_cast(m_minloc); + checksum[vid][tune_idx] += static_cast(m_minloc); } -void FIRST_MIN::tearDown(VariantID vid) +void FIRST_MIN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index a9b48c1b3..c10839ec7 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -72,17 +72,27 @@ class FIRST_MIN : public KernelBase ~FIRST_MIN(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_x; Real_type m_xmin_init; Index_type m_initloc; diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 55d0e2214..85db3d39d 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define FIRST_SUM_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_N); \ allocAndInitCudaDeviceData(y, m_y, m_N); @@ -36,17 +30,20 @@ namespace lcals deallocCudaDeviceData(x); \ deallocCudaDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_sum(Real_ptr x, Real_ptr y, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < iend) { FIRST_SUM_BODY; } } -void FIRST_SUM::runCudaVariant(VariantID vid) +template < size_t block_size > +void FIRST_SUM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -62,7 +59,7 @@ void FIRST_SUM::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - first_sum<<>>( x, y, + first_sum<<>>( x, y, iend ); cudaErrchk( cudaGetLastError() ); @@ -89,10 +86,12 @@ void FIRST_SUM::runCudaVariant(VariantID vid) FIRST_SUM_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n FIRST_SUM : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n FIRST_SUM : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_SUM, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index 0f2cb2ede..1a03619e5 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define FIRST_SUM_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_N); \ allocAndInitHipDeviceData(y, m_y, m_N); @@ -36,17 +30,20 @@ namespace lcals deallocHipDeviceData(x); \ deallocHipDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_sum(Real_ptr x, Real_ptr y, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < iend) { FIRST_SUM_BODY; } } -void FIRST_SUM::runHipVariant(VariantID vid) +template < size_t block_size > +void FIRST_SUM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -62,7 +59,7 @@ void FIRST_SUM::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(first_sum,grid_size, block_size, 0, 0, x, y, + hipLaunchKernelGGL((first_sum),grid_size, block_size, 0, 0, x, y, iend ); hipErrchk( hipGetLastError() ); @@ -89,10 +86,12 @@ void FIRST_SUM::runHipVariant(VariantID vid) FIRST_SUM_DATA_TEARDOWN_HIP; } else { - std::cout << "\n FIRST_SUM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n FIRST_SUM : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_SUM, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM-OMP.cpp b/src/lcals/FIRST_SUM-OMP.cpp index 41e15c1b6..58d1a1070 100644 --- a/src/lcals/FIRST_SUM-OMP.cpp +++ b/src/lcals/FIRST_SUM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 
2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void FIRST_SUM::runOpenMPVariant(VariantID vid) +void FIRST_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void FIRST_SUM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n FIRST_SUM : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_SUM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/FIRST_SUM-OMPTarget.cpp b/src/lcals/FIRST_SUM-OMPTarget.cpp index 24c183e3c..afc53dd6c 100644 --- a/src/lcals/FIRST_SUM-OMPTarget.cpp +++ b/src/lcals/FIRST_SUM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -39,24 +39,24 @@ namespace lcals deallocOpenMPDeviceData(y, did); -void FIRST_SUM::runOpenMPTargetVariant(VariantID vid) +void FIRST_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; const Index_type iend = getActualProblemSize(); FIRST_SUM_DATA_SETUP; - + if ( vid == Base_OpenMPTarget ) { FIRST_SUM_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { FIRST_SUM_BODY; } @@ -65,11 +65,11 @@ void FIRST_SUM::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_SUM_DATA_TEARDOWN_OMP_TARGET; - + } else if ( vid == RAJA_OpenMPTarget ) { FIRST_SUM_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -82,9 +82,9 @@ void FIRST_SUM::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_SUM_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n FIRST_SUM : Unknown OMP Target variant id = " << vid << std::endl; + + } else { + getCout() << "\n FIRST_SUM : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index 60bb5756d..29417f4c1 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void FIRST_SUM::runSeqVariant(VariantID vid) +void FIRST_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -79,7 +79,7 @@ void FIRST_SUM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n FIRST_SUM : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_SUM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index ceaa9bc8b..a9d135446 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -58,18 +58,18 @@ FIRST_SUM::~FIRST_SUM() { } -void FIRST_SUM::setUp(VariantID vid) +void FIRST_SUM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_N, 0.0, vid); allocAndInitData(m_y, m_N, vid); } -void FIRST_SUM::updateChecksum(VariantID vid) +void FIRST_SUM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()); } -void FIRST_SUM::tearDown(VariantID vid) +void FIRST_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index d828ac896..5f019c08c 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -46,17 +46,27 @@ class FIRST_SUM : public KernelBase ~FIRST_SUM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 53793af75..76f840294 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define GEN_LIN_RECUR_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(b5, m_b5, m_N); \ allocAndInitCudaDeviceData(stb5, m_stb5, m_N); \ @@ -40,30 +34,35 @@ namespace lcals deallocCudaDeviceData(sa); \ deallocCudaDeviceData(sb); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void genlinrecur1(Real_ptr b5, Real_ptr stb5, Real_ptr sa, Real_ptr sb, Index_type kb5i, Index_type N) { - Index_type k = blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = blockIdx.x * block_size + threadIdx.x; if (k < N) { GEN_LIN_RECUR_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void genlinrecur2(Real_ptr b5, Real_ptr stb5, Real_ptr sa, Real_ptr sb, Index_type kb5i, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N+1) { GEN_LIN_RECUR_BODY2; } } -void GEN_LIN_RECUR::runCudaVariant(VariantID vid) +template < size_t block_size > +void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -77,13 +76,13 @@ void GEN_LIN_RECUR::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size); - genlinrecur1<<>>( b5, stb5, sa, sb, + genlinrecur1<<>>( b5, stb5, sa, sb, kb5i, N ); cudaErrchk( cudaGetLastError() ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - genlinrecur1<<>>( b5, stb5, sa, sb, + genlinrecur2<<>>( b5, stb5, sa, sb, kb5i, N ); cudaErrchk( cudaGetLastError() ); @@ -116,10 +115,12 @@ void GEN_LIN_RECUR::runCudaVariant(VariantID vid) GEN_LIN_RECUR_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n GEN_LIN_RECUR : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n GEN_LIN_RECUR : Unknown Cuda variant id = " << 
vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(GEN_LIN_RECUR, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index 7d96b27f4..65fef4e8b 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define GEN_LIN_RECUR_DATA_SETUP_HIP \ allocAndInitHipDeviceData(b5, m_b5, m_N); \ allocAndInitHipDeviceData(stb5, m_stb5, m_N); \ @@ -40,30 +34,35 @@ namespace lcals deallocHipDeviceData(sa); \ deallocHipDeviceData(sb); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void genlinrecur1(Real_ptr b5, Real_ptr stb5, Real_ptr sa, Real_ptr sb, Index_type kb5i, Index_type N) { - Index_type k = blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = blockIdx.x * block_size + threadIdx.x; if (k < N) { GEN_LIN_RECUR_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void genlinrecur2(Real_ptr b5, Real_ptr stb5, Real_ptr sa, Real_ptr sb, Index_type kb5i, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N+1) { GEN_LIN_RECUR_BODY2; } } -void GEN_LIN_RECUR::runHipVariant(VariantID vid) +template < size_t block_size > +void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -77,14 +76,14 @@ void GEN_LIN_RECUR::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL(genlinrecur1, grid_size1, block_size, 0, 0, + hipLaunchKernelGGL((genlinrecur1), grid_size1, block_size, 0, 0, b5, stb5, sa, sb, kb5i, N ); hipErrchk( hipGetLastError() ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - hipLaunchKernelGGL(genlinrecur1, grid_size2, block_size, 0, 0, + hipLaunchKernelGGL((genlinrecur2), grid_size2, block_size, 0, 0, b5, stb5, sa, sb, kb5i, N ); @@ -118,10 +117,12 @@ void GEN_LIN_RECUR::runHipVariant(VariantID vid) GEN_LIN_RECUR_DATA_TEARDOWN_HIP; } else { - std::cout << "\n GEN_LIN_RECUR : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n GEN_LIN_RECUR : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(GEN_LIN_RECUR, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR-OMP.cpp b/src/lcals/GEN_LIN_RECUR-OMP.cpp index c70bdef44..3d40a9e47 100644 --- a/src/lcals/GEN_LIN_RECUR-OMP.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid) +void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -95,12 +95,12 @@ void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n GEN_LIN_RECUR : Unknown variant id = " << vid << std::endl; + getCout() << "\n GEN_LIN_RECUR : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp index e72163afb..1949698fd 100644 --- a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(sb, did); -void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid) +void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -57,7 +57,7 @@ void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(b5, stb5, sa, sb) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type k = 0; k < N; ++k ) { GEN_LIN_RECUR_BODY1; } @@ -95,8 +95,8 @@ void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid) GEN_LIN_RECUR_DATA_TEARDOWN_OMP_TARGET - } else { - std::cout << "\n GEN_LIN_RECUR : Unknown OMP Tagretvariant id = " << vid << std::endl; + } else { + getCout() << "\n GEN_LIN_RECUR : Unknown OMP Tagretvariant id = " << vid << std::endl; } } diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp index e6f2233b3..efde12463 100644 --- a/src/lcals/GEN_LIN_RECUR-Seq.cpp +++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void GEN_LIN_RECUR::runSeqVariant(VariantID vid) +void GEN_LIN_RECUR::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -91,7 +91,7 @@ void GEN_LIN_RECUR::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n GEN_LIN_RECUR : Unknown variant id = " << vid << std::endl; + getCout() << "\n GEN_LIN_RECUR : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 6534633da..b0598aa8e 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -63,7 +63,7 @@ GEN_LIN_RECUR::~GEN_LIN_RECUR() { } -void GEN_LIN_RECUR::setUp(VariantID vid) +void GEN_LIN_RECUR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_kb5i = 0; @@ -73,12 +73,12 @@ void GEN_LIN_RECUR::setUp(VariantID vid) allocAndInitData(m_sb, m_N, vid); } -void GEN_LIN_RECUR::updateChecksum(VariantID vid) +void GEN_LIN_RECUR::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor ); } -void GEN_LIN_RECUR::tearDown(VariantID vid) +void GEN_LIN_RECUR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_b5); diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 3fa49e69f..d6d20b43b 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -67,17 +67,27 @@ class GEN_LIN_RECUR : public KernelBase ~GEN_LIN_RECUR(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_b5; Real_ptr m_sa; Real_ptr m_sb; diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index 74f102f5f..901ca786b 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define HYDRO_1D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_array_length); \ allocAndInitCudaDeviceData(y, m_y, m_array_length); \ @@ -38,18 +32,21 @@ namespace lcals deallocCudaDeviceData(y); \ deallocCudaDeviceData(z); \ +template < size_t block_size > +__launch_bounds__(block_size) __global__ void hydro_1d(Real_ptr x, Real_ptr y, Real_ptr z, Real_type q, Real_type r, Real_type t, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { HYDRO_1D_BODY; } } -void HYDRO_1D::runCudaVariant(VariantID vid) +template < size_t block_size > +void HYDRO_1D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -65,7 +62,7 @@ void HYDRO_1D::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hydro_1d<<>>( x, y, z, + hydro_1d<<>>( x, y, z, q, r, t, iend ); cudaErrchk( cudaGetLastError() ); @@ -93,10 +90,12 @@ void HYDRO_1D::runCudaVariant(VariantID vid) HYDRO_1D_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n HYDRO_1D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HYDRO_1D, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 6c06b2de0..d39ec0f7e 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC 
// and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define HYDRO_1D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_array_length); \ allocAndInitHipDeviceData(y, m_y, m_array_length); \ @@ -38,18 +32,21 @@ namespace lcals deallocHipDeviceData(y); \ deallocHipDeviceData(z); \ +template < size_t block_size > +__launch_bounds__(block_size) __global__ void hydro_1d(Real_ptr x, Real_ptr y, Real_ptr z, Real_type q, Real_type r, Real_type t, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { HYDRO_1D_BODY; } } -void HYDRO_1D::runHipVariant(VariantID vid) +template < size_t block_size > +void HYDRO_1D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -65,7 +62,7 @@ void HYDRO_1D::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((hydro_1d), dim3(grid_size), dim3(block_size), 0, 0, x, y, z, + hipLaunchKernelGGL((hydro_1d), dim3(grid_size), dim3(block_size), 0, 0, x, y, z, q, r, t, iend ); hipErrchk( hipGetLastError() ); @@ -93,10 +90,12 @@ void HYDRO_1D::runHipVariant(VariantID vid) HYDRO_1D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n HYDRO_1D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HYDRO_1D, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D-OMP.cpp b/src/lcals/HYDRO_1D-OMP.cpp index f6e0e2277..29ea4db01 100644 --- a/src/lcals/HYDRO_1D-OMP.cpp +++ b/src/lcals/HYDRO_1D-OMP.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void HYDRO_1D::runOpenMPVariant(VariantID vid) +void HYDRO_1D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -82,12 +82,12 @@ void HYDRO_1D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n HYDRO_1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp index 9adafba0a..d154b473f 100644 --- a/src/lcals/HYDRO_1D-OMPTarget.cpp +++ b/src/lcals/HYDRO_1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -41,7 +41,7 @@ namespace lcals deallocOpenMPDeviceData(z, did); \ -void HYDRO_1D::runOpenMPTargetVariant(VariantID vid) +void HYDRO_1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -57,7 +57,7 @@ void HYDRO_1D::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y, z) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { HYDRO_1D_BODY; } @@ -85,7 +85,7 @@ void HYDRO_1D::runOpenMPTargetVariant(VariantID vid) HYDRO_1D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n HYDRO_1D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp index cdf086ffe..2833cf6bc 100644 --- a/src/lcals/HYDRO_1D-Seq.cpp +++ b/src/lcals/HYDRO_1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void HYDRO_1D::runSeqVariant(VariantID vid) +void HYDRO_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -79,7 +79,7 @@ void HYDRO_1D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n HYDRO_1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 08198ca0f..5ce1d0700 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -62,7 +62,7 @@ HYDRO_1D::~HYDRO_1D() { } -void HYDRO_1D::setUp(VariantID vid) +void HYDRO_1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitData(m_y, m_array_length, vid); @@ -73,12 +73,12 @@ void HYDRO_1D::setUp(VariantID vid) initData(m_t, vid); } -void HYDRO_1D::updateChecksum(VariantID vid) +void HYDRO_1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } -void HYDRO_1D::tearDown(VariantID vid) +void HYDRO_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index 029065be8..692e40a8e 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -48,17 +48,27 @@ class HYDRO_1D : public KernelBase ~HYDRO_1D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_z; diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index 2c0087358..21c320a60 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,13 +22,17 @@ namespace lcals { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - constexpr size_t j_block_sz = 32; - constexpr size_t k_block_sz = 8; +#define j_block_sz (32) +#define k_block_sz (block_size / j_block_sz) + +#define HYDRO_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, k_block_sz #define HYDRO_2D_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(j_block_sz, k_block_sz, 1); + dim3 nthreads_per_block(HYDRO_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); \ + static_assert(j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define HYDRO_2D_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(jn-2, j_block_sz)), \ @@ -66,41 +70,47 @@ namespace lcals deallocCudaDeviceData(zroutdat); \ deallocCudaDeviceData(zzoutdat); +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d1(Real_ptr zadat, Real_ptr zbdat, Real_ptr zpdat, Real_ptr zqdat, Real_ptr zrdat, Real_ptr zmdat, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY1; } } +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d2(Real_ptr zudat, Real_ptr zvdat, Real_ptr zadat, Real_ptr zbdat, Real_ptr zzdat, Real_ptr zrdat, Real_type s, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY2; } } +template < size_t j_block_size, size_t k_block_size > 
+__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d3(Real_ptr zroutdat, Real_ptr zzoutdat, Real_ptr zrdat, Real_ptr zudat, Real_ptr zzdat, Real_ptr zvdat, Real_type t, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY3; @@ -108,7 +118,8 @@ __global__ void hydro_2d3(Real_ptr zroutdat, Real_ptr zzoutdat, } -void HYDRO_2D::runCudaVariant(VariantID vid) +template < size_t block_size > +void HYDRO_2D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; @@ -127,19 +138,22 @@ void HYDRO_2D::runCudaVariant(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_CUDA; HYDRO_2D_NBLOCKS_CUDA; - - hydro_2d1<<>>(zadat, zbdat, + + hydro_2d1 + <<>>(zadat, zbdat, zpdat, zqdat, zrdat, zmdat, jn, kn); cudaErrchk( cudaGetLastError() ); - hydro_2d2<<>>(zudat, zvdat, + hydro_2d2 + <<>>(zudat, zvdat, zadat, zbdat, zzdat, zrdat, s, jn, kn); cudaErrchk( cudaGetLastError() ); - hydro_2d3<<>>(zroutdat, zzoutdat, + hydro_2d3 + <<>>(zroutdat, zzoutdat, zrdat, zudat, zzdat, zvdat, t, jn, kn); @@ -159,9 +173,9 @@ void HYDRO_2D::runCudaVariant(VariantID vid) using EXECPOL = RAJA::KernelPolicy< RAJA::statement::CudaKernelFixedAsync, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_y_direct, - RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<0, RAJA::cuda_thread_y_direct, // k RAJA::statement::For<1, RAJA::cuda_thread_x_direct, // j @@ -172,7 +186,7 @@ void HYDRO_2D::runCudaVariant(VariantID vid) > > >; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -203,10 +217,12 @@ void HYDRO_2D::runCudaVariant(VariantID vid) HYDRO_2D_DATA_TEARDOWN_CUDA; } else { 
- std::cout << "\n HYDRO_2D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HYDRO_2D, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index a492999ec..3180c5c10 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,13 +22,17 @@ namespace lcals { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - constexpr size_t j_block_sz = 32; - constexpr size_t k_block_sz = 8; +#define j_block_sz (32) +#define k_block_sz (block_size / j_block_sz) + +#define HYDRO_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, k_block_sz #define HYDRO_2D_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(j_block_sz, k_block_sz, 1); + dim3 nthreads_per_block(HYDRO_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); \ + static_assert(j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define HYDRO_2D_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(jn-2, j_block_sz)), \ @@ -65,41 +69,47 @@ namespace lcals deallocHipDeviceData(zroutdat); \ deallocHipDeviceData(zzoutdat); +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d1(Real_ptr zadat, Real_ptr zbdat, Real_ptr zpdat, Real_ptr zqdat, Real_ptr zrdat, Real_ptr zmdat, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type 
k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY1; } } +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d2(Real_ptr zudat, Real_ptr zvdat, Real_ptr zadat, Real_ptr zbdat, Real_ptr zzdat, Real_ptr zrdat, Real_type s, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY2; } } +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d3(Real_ptr zroutdat, Real_ptr zzoutdat, Real_ptr zrdat, Real_ptr zudat, Real_ptr zzdat, Real_ptr zvdat, Real_type t, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY3; @@ -107,7 +117,8 @@ __global__ void hydro_2d3(Real_ptr zroutdat, Real_ptr zzoutdat, } -void HYDRO_2D::runHipVariant(VariantID vid) +template < size_t block_size > +void HYDRO_2D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; @@ -127,14 +138,14 @@ void HYDRO_2D::runHipVariant(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_HIP; HYDRO_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((hydro_2d1), + hipLaunchKernelGGL((hydro_2d1), dim3(nblocks), dim3(nthreads_per_block), 0, 0, zadat, zbdat, zpdat, zqdat, zrdat, zmdat, jn, kn); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d2), + hipLaunchKernelGGL((hydro_2d2), dim3(nblocks), dim3(nthreads_per_block), 0, 0, 
zudat, zvdat, zadat, zbdat, zzdat, zrdat, @@ -142,7 +153,7 @@ void HYDRO_2D::runHipVariant(VariantID vid) jn, kn); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d3), + hipLaunchKernelGGL((hydro_2d3), dim3(nblocks), dim3(nthreads_per_block), 0, 0, zroutdat, zzoutdat, zrdat, zudat, zzdat, zvdat, @@ -208,10 +219,12 @@ void HYDRO_2D::runHipVariant(VariantID vid) HYDRO_2D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n HYDRO_2D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HYDRO_2D, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D-OMP.cpp b/src/lcals/HYDRO_2D-OMP.cpp index 0c2ce4001..532ee258c 100644 --- a/src/lcals/HYDRO_2D-OMP.cpp +++ b/src/lcals/HYDRO_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void HYDRO_2D::runOpenMPVariant(VariantID vid) +void HYDRO_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -147,19 +147,19 @@ void HYDRO_2D::runOpenMPVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam1); + hydro2d_lam1); RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam2); + hydro2d_lam2); RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam3); + hydro2d_lam3); - }); // end omp parallel region + }); // end omp parallel region } stopTimer(); @@ -168,12 +168,12 @@ void HYDRO_2D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n HYDRO_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/HYDRO_2D-OMPTarget.cpp b/src/lcals/HYDRO_2D-OMPTarget.cpp index ccd813749..18e6ff004 100644 --- a/src/lcals/HYDRO_2D-OMPTarget.cpp +++ b/src/lcals/HYDRO_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -54,7 +54,7 @@ namespace lcals -void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) +void HYDRO_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; @@ -73,7 +73,7 @@ void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(zadat, zbdat, zpdat, \ zqdat, zrdat, zmdat) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type k = kbeg; k < kend; ++k ) { for (Index_type j = jbeg; j < jend; ++j ) { HYDRO_2D_BODY1; @@ -82,7 +82,7 @@ void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(zudat, zvdat, zadat, \ zbdat, zzdat, zrdat) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type k = kbeg; k < kend; ++k ) { for (Index_type j = jbeg; j < jend; ++j ) { HYDRO_2D_BODY2; @@ -91,7 +91,7 @@ void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(zroutdat, zzoutdat, \ zrdat, zudat, zzdat, zvdat) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type k = kbeg; k < kend; ++k ) { for (Index_type j = jbeg; j < jend; ++j ) { HYDRO_2D_BODY3; @@ -147,7 +147,7 @@ void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) HYDRO_2D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n HYDRO_2D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_2D-Seq.cpp b/src/lcals/HYDRO_2D-Seq.cpp index 1a4ecd726..3db534ffc 100644 --- 
a/src/lcals/HYDRO_2D-Seq.cpp +++ b/src/lcals/HYDRO_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void HYDRO_2D::runSeqVariant(VariantID vid) +void HYDRO_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; @@ -128,17 +128,17 @@ void HYDRO_2D::runSeqVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam1); + hydro2d_lam1); RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam2); + hydro2d_lam2); RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam3); + hydro2d_lam3); } stopTimer(); @@ -148,7 +148,7 @@ void HYDRO_2D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n HYDRO_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index e51237f82..9b6c2a643 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -77,7 +77,7 @@ HYDRO_2D::~HYDRO_2D() { } -void HYDRO_2D::setUp(VariantID vid) +void HYDRO_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_zrout, m_array_length, 0.0, vid); allocAndInitDataConst(m_zzout, m_array_length, 0.0, vid); @@ -92,13 +92,13 @@ void HYDRO_2D::setUp(VariantID vid) allocAndInitData(m_zz, m_array_length, vid); } -void HYDRO_2D::updateChecksum(VariantID vid) +void HYDRO_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor ); } -void HYDRO_2D::tearDown(VariantID vid) +void HYDRO_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_zrout); diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index 2525c8c89..4363ea633 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -144,17 +144,28 @@ class HYDRO_2D : public KernelBase ~HYDRO_2D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Real_ptr m_za; Real_ptr m_zb; Real_ptr m_zm; diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 651f5f862..aaed2219e 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define INT_PREDICT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(px, m_px, m_array_length); @@ -34,6 +28,8 @@ namespace lcals getCudaDeviceData(m_px, px, m_array_length); \ deallocCudaDeviceData(px); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void int_predict(Real_ptr px, Real_type dm22, Real_type dm23, Real_type dm24, Real_type dm25, Real_type dm26, Real_type dm27, @@ -41,14 +37,15 @@ __global__ void int_predict(Real_ptr px, const Index_type offset, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INT_PREDICT_BODY; } } -void INT_PREDICT::runCudaVariant(VariantID vid) +template < size_t block_size > +void INT_PREDICT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -64,7 +61,7 @@ void INT_PREDICT::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - int_predict<<>>( px, + int_predict<<>>( px, dm22, dm23, dm24, dm25, dm26, dm27, dm28, c0, offset, @@ -94,10 +91,12 @@ void INT_PREDICT::runCudaVariant(VariantID vid) INT_PREDICT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n INT_PREDICT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INT_PREDICT, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 8d758f70d..22914bff3 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright 
(c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define INT_PREDICT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(px, m_px, m_array_length); @@ -34,6 +28,8 @@ namespace lcals getHipDeviceData(m_px, px, m_array_length); \ deallocHipDeviceData(px); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void int_predict(Real_ptr px, Real_type dm22, Real_type dm23, Real_type dm24, Real_type dm25, Real_type dm26, Real_type dm27, @@ -41,14 +37,15 @@ __global__ void int_predict(Real_ptr px, const Index_type offset, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INT_PREDICT_BODY; } } -void INT_PREDICT::runHipVariant(VariantID vid) +template < size_t block_size > +void INT_PREDICT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -64,7 +61,7 @@ void INT_PREDICT::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((int_predict), dim3(grid_size), dim3(block_size), 0, 0, px, + hipLaunchKernelGGL((int_predict), dim3(grid_size), dim3(block_size), 0, 0, px, dm22, dm23, dm24, dm25, dm26, dm27, dm28, c0, offset, @@ -94,10 +91,12 @@ void INT_PREDICT::runHipVariant(VariantID vid) INT_PREDICT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n INT_PREDICT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INT_PREDICT, Hip) + } // end namespace 
lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT-OMP.cpp b/src/lcals/INT_PREDICT-OMP.cpp index a5f8512af..4e67db9c4 100644 --- a/src/lcals/INT_PREDICT-OMP.cpp +++ b/src/lcals/INT_PREDICT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void INT_PREDICT::runOpenMPVariant(VariantID vid) +void INT_PREDICT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void INT_PREDICT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp index 44cceb4a7..86b8b8169 100644 --- a/src/lcals/INT_PREDICT-OMPTarget.cpp +++ b/src/lcals/INT_PREDICT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -37,7 +37,7 @@ namespace lcals deallocOpenMPDeviceData(px, did); -void INT_PREDICT::runOpenMPTargetVariant(VariantID vid) +void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -48,12 +48,12 @@ void INT_PREDICT::runOpenMPTargetVariant(VariantID vid) if ( vid == Base_OpenMPTarget ) { INT_PREDICT_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(px) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { INT_PREDICT_BODY; } @@ -62,13 +62,13 @@ void INT_PREDICT::runOpenMPTargetVariant(VariantID vid) stopTimer(); INT_PREDICT_DATA_TEARDOWN_OMP_TARGET; - + } else if ( vid == RAJA_OpenMPTarget ) { INT_PREDICT_DATA_SETUP_OMP_TARGET; startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { @@ -81,7 +81,7 @@ void INT_PREDICT::runOpenMPTargetVariant(VariantID vid) INT_PREDICT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n INT_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp index de69a7d4b..83a41071b 100644 --- a/src/lcals/INT_PREDICT-Seq.cpp +++ b/src/lcals/INT_PREDICT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence 
Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void INT_PREDICT::runSeqVariant(VariantID vid) +void INT_PREDICT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -79,7 +79,7 @@ void INT_PREDICT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 096a074ac..c2062fffa 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -55,7 +55,7 @@ INT_PREDICT::~INT_PREDICT() { } -void INT_PREDICT::setUp(VariantID vid) +void INT_PREDICT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_array_length = getActualProblemSize() * 13; m_offset = getActualProblemSize(); @@ -73,16 +73,16 @@ void INT_PREDICT::setUp(VariantID vid) initData(m_c0); } -void INT_PREDICT::updateChecksum(VariantID vid) +void INT_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) { for (Index_type i = 0; i < getActualProblemSize(); ++i) { m_px[i] -= m_px_initval; } - checksum[vid] += calcChecksum(m_px, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_px, getActualProblemSize()); } -void INT_PREDICT::tearDown(VariantID vid) +void INT_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_px); diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index 1253e1a6e..7a3c6fda6 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -63,17 +63,27 @@ class INT_PREDICT : public KernelBase ~INT_PREDICT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Index_type m_array_length; Index_type m_offset; diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index fd46a4fdf..c831aab2e 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,12 +22,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define PLANCKIAN_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend); \ allocAndInitCudaDeviceData(y, m_y, iend); \ @@ -43,18 +37,21 @@ namespace lcals deallocCudaDeviceData(v); \ deallocCudaDeviceData(w); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void planckian(Real_ptr x, Real_ptr y, Real_ptr u, Real_ptr v, Real_ptr w, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PLANCKIAN_BODY; } } -void PLANCKIAN::runCudaVariant(VariantID vid) +template < size_t block_size > +void PLANCKIAN::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -70,7 +67,7 @@ void PLANCKIAN::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - planckian<<>>( x, y, + planckian<<>>( x, y, u, v, w, iend ); cudaErrchk( cudaGetLastError() ); @@ -98,10 +95,12 @@ void PLANCKIAN::runCudaVariant(VariantID vid) PLANCKIAN_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n PLANCKIAN : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PLANCKIAN, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index f47d04ce9..1b8c6050b 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance 
Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,12 +22,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define PLANCKIAN_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend); \ allocAndInitHipDeviceData(y, m_y, iend); \ @@ -43,18 +37,21 @@ namespace lcals deallocHipDeviceData(v); \ deallocHipDeviceData(w); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void planckian(Real_ptr x, Real_ptr y, Real_ptr u, Real_ptr v, Real_ptr w, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PLANCKIAN_BODY; } } -void PLANCKIAN::runHipVariant(VariantID vid) +template < size_t block_size > +void PLANCKIAN::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -70,7 +67,7 @@ void PLANCKIAN::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((planckian), dim3(grid_size), dim3(block_size), 0, 0, x, y, + hipLaunchKernelGGL((planckian), dim3(grid_size), dim3(block_size), 0, 0, x, y, u, v, w, iend ); hipErrchk( hipGetLastError() ); @@ -98,10 +95,12 @@ void PLANCKIAN::runHipVariant(VariantID vid) PLANCKIAN_DATA_TEARDOWN_HIP; } else { - std::cout << "\n PLANCKIAN : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PLANCKIAN, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN-OMP.cpp b/src/lcals/PLANCKIAN-OMP.cpp index 01c76f5ed..e802a96fd 100644 --- a/src/lcals/PLANCKIAN-OMP.cpp +++ b/src/lcals/PLANCKIAN-OMP.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -13,13 +13,13 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void PLANCKIAN::runOpenMPVariant(VariantID vid) +void PLANCKIAN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -82,12 +82,12 @@ void PLANCKIAN::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp index 61fa12b1d..02858604f 100644 --- a/src/lcals/PLANCKIAN-OMPTarget.cpp +++ b/src/lcals/PLANCKIAN-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -17,7 +17,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -46,7 +46,7 @@ namespace lcals deallocOpenMPDeviceData(w, did); -void PLANCKIAN::runOpenMPTargetVariant(VariantID vid) +void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -57,12 +57,12 @@ void PLANCKIAN::runOpenMPTargetVariant(VariantID vid) if ( vid == Base_OpenMPTarget ) { PLANCKIAN_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y, u, v, w) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { PLANCKIAN_BODY; } @@ -70,12 +70,12 @@ void PLANCKIAN::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; + PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { - PLANCKIAN_DATA_SETUP_OMP_TARGET; - + PLANCKIAN_DATA_SETUP_OMP_TARGET; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,10 +87,10 @@ void PLANCKIAN::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; + PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n PLANCKIAN : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp index fa5cb565e..efd372444 100644 --- a/src/lcals/PLANCKIAN-Seq.cpp +++ b/src/lcals/PLANCKIAN-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence 
Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -13,13 +13,13 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void PLANCKIAN::runSeqVariant(VariantID vid) +void PLANCKIAN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -80,7 +80,7 @@ void PLANCKIAN::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 564a71a7e..59de57231 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -55,7 +55,7 @@ PLANCKIAN::~PLANCKIAN() { } -void PLANCKIAN::setUp(VariantID vid) +void PLANCKIAN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_x, getActualProblemSize(), vid); allocAndInitData(m_y, getActualProblemSize(), vid); @@ -64,12 +64,12 @@ void PLANCKIAN::setUp(VariantID vid) allocAndInitDataConst(m_w, getActualProblemSize(), 0.0, vid); } -void PLANCKIAN::updateChecksum(VariantID vid) +void PLANCKIAN::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_w, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_w, getActualProblemSize()); } -void PLANCKIAN::tearDown(VariantID vid) +void PLANCKIAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 1e5b744db..46fba63db 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -48,17 +48,27 @@ class PLANCKIAN : public KernelBase ~PLANCKIAN(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_u; diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index b06884f0e..654d027a9 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define TRIDIAG_ELIM_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(xout, m_xout, m_N); \ allocAndInitCudaDeviceData(xin, m_xin, m_N); \ @@ -40,17 +34,20 @@ namespace lcals deallocCudaDeviceData(y); \ deallocCudaDeviceData(z); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N) { TRIDIAG_ELIM_BODY; } } -void TRIDIAG_ELIM::runCudaVariant(VariantID vid) +template < size_t block_size > +void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -66,7 +63,8 @@ void TRIDIAG_ELIM::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - eos<<>>( xout, xin, y, z, + eos + <<>>( xout, xin, y, z, iend ); cudaErrchk( cudaGetLastError() ); @@ -93,10 +91,12 @@ void TRIDIAG_ELIM::runCudaVariant(VariantID vid) TRIDIAG_ELIM_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n TRIDIAG_ELIM : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n TRIDIAG_ELIM : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRIDIAG_ELIM, Cuda) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index 66ae4bad5..dab19cc07 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore 
National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define TRIDIAG_ELIM_DATA_SETUP_HIP \ allocAndInitHipDeviceData(xout, m_xout, m_N); \ allocAndInitHipDeviceData(xin, m_xin, m_N); \ @@ -40,17 +34,20 @@ namespace lcals deallocHipDeviceData(y); \ deallocHipDeviceData(z); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N) { TRIDIAG_ELIM_BODY; } } -void TRIDIAG_ELIM::runHipVariant(VariantID vid) +template < size_t block_size > +void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -66,7 +63,7 @@ void TRIDIAG_ELIM::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(eos, grid_size, block_size, 0, 0, xout, xin, y, z, + hipLaunchKernelGGL((eos), grid_size, block_size, 0, 0, xout, xin, y, z, iend ); hipErrchk( hipGetLastError() ); @@ -93,10 +90,12 @@ void TRIDIAG_ELIM::runHipVariant(VariantID vid) TRIDIAG_ELIM_DATA_TEARDOWN_HIP; } else { - std::cout << "\n TRIDIAG_ELIM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n TRIDIAG_ELIM : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRIDIAG_ELIM, Hip) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM-OMP.cpp b/src/lcals/TRIDIAG_ELIM-OMP.cpp index b1773ee33..a78c4a210 100644 --- a/src/lcals/TRIDIAG_ELIM-OMP.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMP.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid) +void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n TRIDIAG_ELIM : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRIDIAG_ELIM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp index 523b47dd9..ff21303da 100644 --- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(z, did); -void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid) +void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -59,7 +59,7 @@ void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(xout, xin, y, z) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { TRIDIAG_ELIM_BODY; } @@ -79,15 +79,15 @@ void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid) RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { TRIDIAG_ELIM_BODY; - }); + }); } stopTimer(); TRIDIAG_ELIM_DATA_TEARDOWN_OMP_TARGET - } else { - std::cout << "\n TRIDIAG_ELIM : Unknown OMP Tagretvariant id = " << vid << std::endl; + } else { + getCout() << "\n TRIDIAG_ELIM : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp index 60303d353..8aa6dc451 100644 --- a/src/lcals/TRIDIAG_ELIM-Seq.cpp +++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { -void TRIDIAG_ELIM::runSeqVariant(VariantID vid) +void TRIDIAG_ELIM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -79,7 +79,7 @@ void TRIDIAG_ELIM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n TRIDIAG_ELIM : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRIDIAG_ELIM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index d35c08a51..05d0100a8 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -57,7 +57,7 @@ TRIDIAG_ELIM::~TRIDIAG_ELIM() { } -void TRIDIAG_ELIM::setUp(VariantID vid) +void TRIDIAG_ELIM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_xout, m_N, 0.0, vid); allocAndInitData(m_xin, m_N, vid); @@ -65,12 +65,12 @@ void TRIDIAG_ELIM::setUp(VariantID vid) allocAndInitData(m_z, m_N, vid); } -void TRIDIAG_ELIM::updateChecksum(VariantID vid) +void TRIDIAG_ELIM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_xout, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_xout, getActualProblemSize()); } -void TRIDIAG_ELIM::tearDown(VariantID vid) +void TRIDIAG_ELIM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_xout); diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index 73ffeb341..f593985a5 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -48,17 +48,27 @@ class TRIDIAG_ELIM : public KernelBase ~TRIDIAG_ELIM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_xout; Real_ptr m_xin; Real_ptr m_y; diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index fec49e204..5805926f3 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index f6165d74c..40b1f5ca3 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,13 +22,16 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t out_block_sz = 8; -constexpr size_t in_block_sz = 32; +#define in_block_sz (32) +#define out_block_sz (block_size / in_block_sz) + +#define POLY_2MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + in_block_sz, out_block_sz #define POLY_2MM_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); + dim3 nthreads_per_block(POLY_2MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define POLY_2MM_1_NBLOCKS_CUDA \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(nj, in_block_sz)), \ @@ -58,12 +61,14 @@ constexpr size_t in_block_sz = 32; deallocCudaDeviceData(D); +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, Real_type alpha, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { POLYBENCH_2MM_BODY1; @@ -74,24 +79,27 @@ __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > 
+__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1_lam(Index_type ni, Index_type nj, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { - body(i, j); + body(i, j); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_2(Real_ptr tmp, Real_ptr C, Real_ptr D, Real_type beta, Index_type ni, Index_type nl, Index_type nj) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { POLYBENCH_2MM_BODY4; @@ -102,12 +110,13 @@ __global__ void poly_2mm_2(Real_ptr tmp, Real_ptr C, Real_ptr D, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_2_lam(Index_type ni, Index_type nl, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { body(i, l); @@ -115,7 +124,8 @@ __global__ void poly_2mm_2_lam(Index_type ni, Index_type nl, } -void POLYBENCH_2MM::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -131,12 +141,14 @@ void POLYBENCH_2MM::runCudaVariant(VariantID vid) POLY_2MM_THREADS_PER_BLOCK_CUDA; POLY_2MM_1_NBLOCKS_CUDA; - poly_2mm_1<<>>(tmp, A, B, alpha, + poly_2mm_1 + <<>>(tmp, A, B, alpha, ni, nj, nk); 
cudaErrchk( cudaGetLastError() ); POLY_2MM_2_NBLOCKS_CUDA; - poly_2mm_2<<>>(tmp, C, D, beta, + poly_2mm_2 + <<>>(tmp, C, D, beta, ni, nl, nj); cudaErrchk( cudaGetLastError() ); @@ -155,7 +167,8 @@ void POLYBENCH_2MM::runCudaVariant(VariantID vid) POLY_2MM_THREADS_PER_BLOCK_CUDA; POLY_2MM_1_NBLOCKS_CUDA; - poly_2mm_1_lam<<>>(ni, nj, + poly_2mm_1_lam + <<>>(ni, nj, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_2MM_BODY1; for (Index_type k=0; k < nk; ++k) { @@ -167,7 +180,8 @@ void POLYBENCH_2MM::runCudaVariant(VariantID vid) cudaErrchk( cudaGetLastError() ); POLY_2MM_2_NBLOCKS_CUDA; - poly_2mm_2_lam<<>>(ni, nl, + poly_2mm_2_lam + <<>>(ni, nl, [=] __device__ (Index_type i, Index_type l) { POLYBENCH_2MM_BODY4; for (Index_type j=0; j < nj; ++j) { @@ -257,11 +271,12 @@ void POLYBENCH_2MM::runCudaVariant(VariantID vid) POLYBENCH_2MM_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_2MM : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_2MM, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index c3c9869b4..15ffa80df 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,10 +22,13 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t out_block_sz = 8; -constexpr size_t in_block_sz = 32; +#define in_block_sz (32) +#define out_block_sz (block_size / in_block_sz) + +#define POLY_2MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + in_block_sz, out_block_sz #define POLY_2MM_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); @@ -57,12 +60,14 @@ constexpr size_t in_block_sz = 32; deallocHipDeviceData(C); \ deallocHipDeviceData(D); +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, Real_type alpha, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { POLYBENCH_2MM_BODY1; @@ -73,24 +78,27 @@ __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1_lam(Index_type ni, Index_type nj, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { body(i, j); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_2(Real_ptr tmp, Real_ptr C, Real_ptr D, Real_type beta, Index_type ni, Index_type nl, Index_type nj) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = 
blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { POLYBENCH_2MM_BODY4; @@ -101,12 +109,13 @@ __global__ void poly_2mm_2(Real_ptr tmp, Real_ptr C, Real_ptr D, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_2_lam(Index_type ni, Index_type nl, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { body(i, l); @@ -114,7 +123,8 @@ __global__ void poly_2mm_2_lam(Index_type ni, Index_type nl, } -void POLYBENCH_2MM::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -130,14 +140,14 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) POLY_2MM_THREADS_PER_BLOCK_HIP; POLY_2MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_1), + hipLaunchKernelGGL((poly_2mm_1), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, tmp, A, B, alpha, ni, nj, nk); hipErrchk( hipGetLastError() ); POLY_2MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_2), + hipLaunchKernelGGL((poly_2mm_2), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, tmp, C, D, beta, ni, nl, nj); @@ -165,12 +175,12 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) POLYBENCH_2MM_BODY3; }; - POLY_2MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_1_lam), + POLY_2MM_1_NBLOCKS_HIP; + hipLaunchKernelGGL((poly_2mm_1_lam), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, ni, nj, poly_2mm_1_lambda); hipErrchk( hipGetLastError() ); - + auto poly_2mm_2_lambda = [=] __device__ (Index_type i, Index_type l) { POLYBENCH_2MM_BODY4; for (Index_type j=0; j < nj; ++j) 
{ @@ -180,7 +190,7 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) }; POLY_2MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_2_lam), + hipLaunchKernelGGL((poly_2mm_2_lam), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, ni, nl, poly_2mm_2_lambda); hipErrchk( hipGetLastError() ); @@ -264,11 +274,12 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) POLYBENCH_2MM_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_2MM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_2MM, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM-OMP.cpp b/src/polybench/POLYBENCH_2MM-OMP.cpp index 2fed3550a..687f93c45 100644 --- a/src/polybench/POLYBENCH_2MM-OMP.cpp +++ b/src/polybench/POLYBENCH_2MM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -20,13 +20,13 @@ //#undef USE_RAJA_OMP_COLLAPSE -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) +void POLYBENCH_2MM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -45,7 +45,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) #pragma omp parallel for collapse(2) #else #pragma omp parallel for -#endif +#endif for (Index_type i = 0; i < ni; i++ ) { for(Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY1; @@ -60,7 +60,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) #pragma omp parallel for collapse(2) #else #pragma omp parallel for -#endif +#endif for(Index_type i = 0; i < ni; i++) { for(Index_type l = 0; l < nl; l++) { POLYBENCH_2MM_BODY4; @@ -142,7 +142,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) auto poly_2mm_lam1 = [=](Real_type &dot) { POLYBENCH_2MM_BODY1_RAJA; }; - auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k, + auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k, Real_type &dot) { POLYBENCH_2MM_BODY2_RAJA; }; @@ -153,7 +153,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) auto poly_2mm_lam4 = [=](Real_type &dot) { POLYBENCH_2MM_BODY4_RAJA; }; - auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j, + auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_2MM_BODY5_RAJA; }; @@ -192,7 +192,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk}), @@ -203,7 +203,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) poly_2mm_lam3 ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), @@ 
-221,12 +221,12 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp index ab68c8e2a..ab7860935 100644 --- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -29,7 +29,7 @@ namespace polybench allocAndInitOpenMPDeviceData(A, m_A, m_ni * m_nk, did, hid); \ allocAndInitOpenMPDeviceData(B, m_B, m_nk * m_nj, did, hid); \ allocAndInitOpenMPDeviceData(C, m_C, m_nj * m_nl, did, hid); \ - allocAndInitOpenMPDeviceData(D, m_D, m_ni * m_nl, did, hid); + allocAndInitOpenMPDeviceData(D, m_D, m_ni * m_nl, did, hid); #define POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET \ @@ -41,7 +41,7 @@ namespace polybench deallocOpenMPDeviceData(D, did); -void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -53,9 +53,9 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + #pragma omp target is_device_ptr(tmp,A,B) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type i = 0; i < ni; i++ ) { for(Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY1; @@ -75,11 
+75,11 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) POLYBENCH_2MM_BODY5; } POLYBENCH_2MM_BODY6; - } + } } } - stopTimer(); + stopTimer(); POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET; @@ -121,7 +121,7 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) } ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), @@ -144,7 +144,7 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_2MM : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp index 36e70e2bc..6e59576b1 100644 --- a/src/polybench/POLYBENCH_2MM-Seq.cpp +++ b/src/polybench/POLYBENCH_2MM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,12 +13,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_2MM::runSeqVariant(VariantID vid) +void POLYBENCH_2MM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); @@ -31,7 +31,7 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < ni; i++ ) { + for (Index_type i = 0; i < ni; i++ ) { for (Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY1; for (Index_type k = 0; k < nk; k++) { @@ -114,7 +114,7 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) auto poly_2mm_lam1 = [=](Real_type &dot) { POLYBENCH_2MM_BODY1_RAJA; }; - auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k, + auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k, Real_type &dot) { POLYBENCH_2MM_BODY2_RAJA; }; @@ -125,7 +125,7 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) auto poly_2mm_lam4 = [=](Real_type &dot) { POLYBENCH_2MM_BODY4_RAJA; }; - auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j, + auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_2MM_BODY5_RAJA; }; @@ -150,25 +150,25 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk}), RAJA::tuple{0.0}, - poly_2mm_lam1, - poly_2mm_lam2, + poly_2mm_lam1, + poly_2mm_lam2, poly_2mm_lam3 ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), RAJA::tuple{0.0}, - poly_2mm_lam4, - poly_2mm_lam5, + poly_2mm_lam4, + poly_2mm_lam5, poly_2mm_lam6 ); @@ -180,7 +180,7 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << 
"\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 7e2083c50..03119a863 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -27,7 +27,7 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) Index_type nk_default = 1120; Index_type nl_default = 1000; - setDefaultProblemSize( std::max( ni_default*nj_default, + setDefaultProblemSize( std::max( ni_default*nj_default, ni_default*nl_default ) ); setDefaultReps(2); @@ -54,10 +54,10 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) setFLOPsPerRep(3 * m_ni*m_nj*m_nk + 2 * m_ni*m_nj*m_nl ); - checksum_scale_factor = 0.000001 * + checksum_scale_factor = 0.000001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); - + setUsesFeature(Kernel); setVariantDefined( Base_Seq ); @@ -84,7 +84,7 @@ POLYBENCH_2MM::~POLYBENCH_2MM() { } -void POLYBENCH_2MM::setUp(VariantID vid) +void POLYBENCH_2MM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_tmp, m_ni * m_nj, vid); @@ -94,12 +94,12 @@ void POLYBENCH_2MM::setUp(VariantID vid) allocAndInitDataConst(m_D, m_ni * m_nl, 0.0, vid); } -void POLYBENCH_2MM::updateChecksum(VariantID vid) +void POLYBENCH_2MM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor ); } -void POLYBENCH_2MM::tearDown(VariantID vid) +void 
POLYBENCH_2MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_tmp); diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 897eb13a3..0624257f7 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -118,17 +118,28 @@ class POLYBENCH_2MM : public KernelBase ~POLYBENCH_2MM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Index_type m_ni; Index_type m_nj; Index_type m_nk; diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index 956efb427..f9b151ebf 100644 --- 
a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,13 +22,16 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t out_block_sz = 8; -constexpr size_t in_block_sz = 32; +#define in_block_sz (32) +#define out_block_sz (block_size / in_block_sz) + +#define POLY_3MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + in_block_sz, out_block_sz #define POLY_3MM_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); + dim3 nthreads_per_block(POLY_3MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define POLY_3MM_1_NBLOCKS_CUDA \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(nj, in_block_sz)), \ @@ -66,11 +69,13 @@ constexpr size_t in_block_sz = 32; deallocCudaDeviceData(F); \ deallocCudaDeviceData(G); +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { POLYBENCH_3MM_BODY1; @@ -81,23 +86,26 @@ __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1_lam(Index_type ni, Index_type nj, Lambda body) 
{ - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { body(i, j); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_2(Real_ptr F, Real_ptr C, Real_ptr D, Index_type nj, Index_type nl, Index_type nm) { - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( j < nj && l < nl ) { POLYBENCH_3MM_BODY4; @@ -108,23 +116,26 @@ __global__ void poly_3mm_2(Real_ptr F, Real_ptr C, Real_ptr D, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_2_lam(Index_type nj, Index_type nl, Lambda body) { - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( j < nj && l < nl ) { body(j, l); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_3(Real_ptr G, Real_ptr E, Real_ptr F, Index_type ni, Index_type nl, Index_type nj) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { POLYBENCH_3MM_BODY7; @@ -135,12 +146,13 @@ __global__ void poly_3mm_3(Real_ptr G, Real_ptr E, Real_ptr F, } } -template< typename Lambda > +template < size_t 
in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_3_lam(Index_type ni, Index_type nl, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { body(i, l); @@ -149,7 +161,8 @@ __global__ void poly_3mm_3_lam(Index_type ni, Index_type nl, -void POLYBENCH_3MM::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -165,17 +178,20 @@ void POLYBENCH_3MM::runCudaVariant(VariantID vid) POLY_3MM_THREADS_PER_BLOCK_CUDA; POLY_3MM_1_NBLOCKS_CUDA; - poly_3mm_1<<>>(E, A, B, + poly_3mm_1 + <<>>(E, A, B, ni, nj, nk); cudaErrchk( cudaGetLastError() ); POLY_3MM_2_NBLOCKS_CUDA; - poly_3mm_2<<>>(F, C, D, + poly_3mm_2 + <<>>(F, C, D, nj, nl, nm); cudaErrchk( cudaGetLastError() ); POLY_3MM_3_NBLOCKS_CUDA; - poly_3mm_3<<>>(G, E, F, + poly_3mm_3 + <<>>(G, E, F, ni, nl, nj); cudaErrchk( cudaGetLastError() ); @@ -194,7 +210,8 @@ void POLYBENCH_3MM::runCudaVariant(VariantID vid) POLY_3MM_THREADS_PER_BLOCK_CUDA; POLY_3MM_1_NBLOCKS_CUDA; - poly_3mm_1_lam<<>>(ni, nj, + poly_3mm_1_lam + <<>>(ni, nj, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_3MM_BODY1; for (Index_type k=0; k < nk; ++k) { @@ -206,7 +223,8 @@ void POLYBENCH_3MM::runCudaVariant(VariantID vid) cudaErrchk( cudaGetLastError() ); POLY_3MM_2_NBLOCKS_CUDA; - poly_3mm_2_lam<<>>(nj, nl, + poly_3mm_2_lam + <<>>(nj, nl, [=] __device__ (Index_type j, Index_type l) { POLYBENCH_3MM_BODY4; for (Index_type m=0; m < nm; ++m) { @@ -218,7 +236,8 @@ void POLYBENCH_3MM::runCudaVariant(VariantID vid) cudaErrchk( cudaGetLastError() ); POLY_3MM_3_NBLOCKS_CUDA; - poly_3mm_3_lam<<>>(ni, nl, + poly_3mm_3_lam + <<>>(ni, nl, [=] __device__ 
(Index_type i, Index_type l) { POLYBENCH_3MM_BODY7; for (Index_type j=0; j < nj; ++j) { @@ -330,11 +349,12 @@ void POLYBENCH_3MM::runCudaVariant(VariantID vid) POLYBENCH_3MM_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_3MM : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_3MM : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_3MM, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index 51e8ac53f..4199f0c44 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,13 +22,16 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t out_block_sz = 8; -constexpr size_t in_block_sz = 32; +#define in_block_sz (32) +#define out_block_sz (block_size / in_block_sz) + +#define POLY_3MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + in_block_sz, out_block_sz #define POLY_3MM_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); + dim3 nthreads_per_block(POLY_3MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define POLY_3MM_1_NBLOCKS_HIP \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(nj, in_block_sz)), \ @@ -66,11 +69,13 @@ constexpr size_t in_block_sz = 32; deallocHipDeviceData(F); \ deallocHipDeviceData(G); +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { POLYBENCH_3MM_BODY1; @@ -81,23 +86,26 @@ __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1_lam(Index_type ni, Index_type nj, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { body(i, j); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_2(Real_ptr F, 
Real_ptr C, Real_ptr D, Index_type nj, Index_type nl, Index_type nm) { - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( j < nj && l < nl ) { POLYBENCH_3MM_BODY4; @@ -108,23 +116,26 @@ __global__ void poly_3mm_2(Real_ptr F, Real_ptr C, Real_ptr D, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_2_lam(Index_type nj, Index_type nl, Lambda body) { - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( j < nj && l < nl ) { body(j, l); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_3(Real_ptr G, Real_ptr E, Real_ptr F, Index_type ni, Index_type nl, Index_type nj) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { POLYBENCH_3MM_BODY7; @@ -135,12 +146,13 @@ __global__ void poly_3mm_3(Real_ptr G, Real_ptr E, Real_ptr F, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_3_lam(Index_type ni, Index_type nl, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && 
l < nl ) { body(i, l); @@ -148,7 +160,8 @@ __global__ void poly_3mm_3_lam(Index_type ni, Index_type nl, } -void POLYBENCH_3MM::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -163,22 +176,22 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) POLY_3MM_THREADS_PER_BLOCK_HIP; - POLY_3MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_1), + POLY_3MM_1_NBLOCKS_HIP; + hipLaunchKernelGGL((poly_3mm_1), dim3(nblocks1) , dim3(nthreads_per_block), 0, 0, E, A, B, ni, nj, nk); hipErrchk( hipGetLastError() ); POLY_3MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_2), + hipLaunchKernelGGL((poly_3mm_2), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, F, C, D, nj, nl, nm); hipErrchk( hipGetLastError() ); POLY_3MM_3_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_3), + hipLaunchKernelGGL((poly_3mm_3), dim3(nblocks3), dim3(nthreads_per_block), 0, 0, G, E, F, ni, nl, nj); @@ -207,7 +220,7 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) }; POLY_3MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_1_lam), + hipLaunchKernelGGL((poly_3mm_1_lam), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, ni, nj, poly_3mm_1_lambda); hipErrchk( hipGetLastError() ); @@ -221,7 +234,7 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) }; POLY_3MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_2_lam), + hipLaunchKernelGGL((poly_3mm_2_lam), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, nj, nl, poly_3mm_2_lambda); hipErrchk( hipGetLastError() ); @@ -235,7 +248,7 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) }; POLY_3MM_3_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_3_lam), + hipLaunchKernelGGL((poly_3mm_3_lam), dim3(nblocks3), dim3(nthreads_per_block), 0, 0, ni, nl, poly_3mm_3_lambda); hipErrchk( hipGetLastError() ); @@ -270,7 +283,7 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) > > > - >; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -341,11 +354,12 @@ 
void POLYBENCH_3MM::runHipVariant(VariantID vid) POLYBENCH_3MM_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_3MM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_3MM : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_3MM, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_3MM-OMP.cpp b/src/polybench/POLYBENCH_3MM-OMP.cpp index 5c8a595b7..a45f4dd28 100644 --- a/src/polybench/POLYBENCH_3MM-OMP.cpp +++ b/src/polybench/POLYBENCH_3MM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,13 +21,13 @@ //#undef USE_RAJA_OMP_COLLAPSE -namespace rajaperf +namespace rajaperf { namespace polybench { - -void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) + +void POLYBENCH_3MM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -45,7 +45,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) #if defined(USE_OMP_COLLAPSE) #pragma omp parallel for collapse(2) #else - #pragma omp parallel for + #pragma omp parallel for #endif for (Index_type i = 0; i < ni; i++ ) { for (Index_type j = 0; j < nj; j++) { @@ -60,7 +60,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) #if defined(USE_OMP_COLLAPSE) #pragma omp parallel for collapse(2) #else - #pragma omp parallel for + #pragma omp parallel for #endif for (Index_type j = 0; j < nj; j++) { for (Index_type l = 0; l < nl; l++) { @@ -75,7 +75,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) #if defined(USE_OMP_COLLAPSE) #pragma omp parallel for collapse(2) #else - #pragma omp parallel for + #pragma omp 
parallel for #endif for (Index_type i = 0; i < ni; i++) { for (Index_type l = 0; l < nl; l++) { @@ -181,7 +181,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) auto poly_3mm_lam1 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY1_RAJA; }; - auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k, + auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k, Real_type &dot) { POLYBENCH_3MM_BODY2_RAJA; }; @@ -192,7 +192,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) auto poly_3mm_lam4 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY4_RAJA; }; - auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m, + auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m, Real_type &dot) { POLYBENCH_3MM_BODY5_RAJA; }; @@ -203,7 +203,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) auto poly_3mm_lam7 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY7_RAJA; }; - auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j, + auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_3MM_BODY8_RAJA; }; @@ -285,12 +285,12 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_3MM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp index db965fc18..21c1ce7fa 100644 --- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -31,7 +31,7 @@ namespace polybench allocAndInitOpenMPDeviceData(D, m_D, m_nm * m_nl, did, hid); \ allocAndInitOpenMPDeviceData(E, m_E, m_ni * m_nj, did, hid); \ allocAndInitOpenMPDeviceData(F, m_F, m_nj * m_nl, did, hid); \ - allocAndInitOpenMPDeviceData(G, m_G, m_ni * m_nl, did, hid); + allocAndInitOpenMPDeviceData(G, m_G, m_ni * m_nl, did, hid); #define POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET \ @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(F, did); \ deallocOpenMPDeviceData(G, did); -void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -56,7 +56,7 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + #pragma omp target is_device_ptr(A,B,E) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type i = 0; i < ni; i++ ) { @@ -94,7 +94,7 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) } } - stopTimer(); + stopTimer(); POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET; @@ -107,7 +107,7 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::Collapse, + RAJA::ArgList<0, 1>, RAJA::statement::Lambda<0, RAJA::Params<0>>, RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>> @@ -118,8 +118,8 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::kernel_param( + + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk}), @@ -128,18 +128,18 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) [=] (Real_type &dot) { POLYBENCH_3MM_BODY1_RAJA; }, - [=] (Index_type i, Index_type j, Index_type k, + [=] (Index_type i, Index_type 
j, Index_type k, Real_type &dot) { POLYBENCH_3MM_BODY2_RAJA; }, - [=] (Index_type i, Index_type j, + [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_3MM_BODY3_RAJA; } ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nm}), @@ -148,18 +148,18 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) [=] (Real_type &dot) { POLYBENCH_3MM_BODY4_RAJA; }, - [=] (Index_type j, Index_type l, Index_type m, + [=] (Index_type j, Index_type l, Index_type m, Real_type &dot) { POLYBENCH_3MM_BODY5_RAJA; }, - [=] (Index_type j, Index_type l, + [=] (Index_type j, Index_type l, Real_type &dot) { POLYBENCH_3MM_BODY6_RAJA; } ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), @@ -168,16 +168,16 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) [=] (Real_type &dot) { POLYBENCH_3MM_BODY7_RAJA; }, - [=] (Index_type i, Index_type l, Index_type j, + [=] (Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_3MM_BODY8_RAJA; }, - [=] (Index_type i, Index_type l, + [=] (Index_type i, Index_type l, Real_type &dot) { POLYBENCH_3MM_BODY9_RAJA; } - ); + ); } stopTimer(); @@ -185,7 +185,7 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_3MM : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_3MM : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_3MM-Seq.cpp b/src/polybench/POLYBENCH_3MM-Seq.cpp index 6af320fd8..c1ca8c56d 100644 --- a/src/polybench/POLYBENCH_3MM-Seq.cpp +++ b/src/polybench/POLYBENCH_3MM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence 
Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -14,13 +14,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - -void POLYBENCH_3MM::runSeqVariant(VariantID vid) + +void POLYBENCH_3MM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -143,7 +143,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) auto poly_3mm_lam1 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY1_RAJA; }; - auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k, + auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k, Real_type &dot) { POLYBENCH_3MM_BODY2_RAJA; }; @@ -154,7 +154,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) auto poly_3mm_lam4 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY4_RAJA; }; - auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m, + auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m, Real_type &dot) { POLYBENCH_3MM_BODY5_RAJA; }; @@ -165,7 +165,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) auto poly_3mm_lam7 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY7_RAJA; }; - auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j, + auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_3MM_BODY8_RAJA; }; @@ -212,7 +212,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) poly_3mm_lam5, poly_3mm_lam6 - ); + ); RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, @@ -234,7 +234,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 2c06a72ac..75990394c 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ 
b/src/polybench/POLYBENCH_3MM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -29,8 +29,8 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) Index_type nl_default = 1000; Index_type nm_default = 1200; - setDefaultProblemSize( std::max( std::max( ni_default*nj_default, - nj_default*nl_default ), + setDefaultProblemSize( std::max( std::max( ni_default*nj_default, + nj_default*nl_default ), ni_default*nl_default ) ); setDefaultProblemSize( ni_default * nj_default ); setDefaultReps(2); @@ -42,7 +42,7 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) m_nm = nm_default; - setActualProblemSize( std::max( std::max( m_ni*m_nj, m_nj*m_nl ), + setActualProblemSize( std::max( std::max( m_ni*m_nj, m_nj*m_nl ), m_ni*m_nl ) ); setItsPerRep( m_ni*m_nj + m_nj*m_nl + m_ni*m_nl ); @@ -62,7 +62,7 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) 2 * m_nj*m_nl*m_nm + 2 * m_ni*m_nj*m_nl ); - checksum_scale_factor = 0.000000001 * + checksum_scale_factor = 0.000000001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); @@ -92,7 +92,7 @@ POLYBENCH_3MM::~POLYBENCH_3MM() { } -void POLYBENCH_3MM::setUp(VariantID vid) +void POLYBENCH_3MM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_A, m_ni * m_nk, vid); @@ -104,12 +104,12 @@ void POLYBENCH_3MM::setUp(VariantID vid) allocAndInitDataConst(m_G, m_ni * m_nl, 0.0, vid); } -void POLYBENCH_3MM::updateChecksum(VariantID vid) +void POLYBENCH_3MM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor ); } -void 
POLYBENCH_3MM::tearDown(VariantID vid) +void POLYBENCH_3MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 80d0a2fe5..0cf9aabff 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -144,17 +144,28 @@ class POLYBENCH_3MM : public KernelBase ~POLYBENCH_3MM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Index_type m_ni; Index_type m_nj; Index_type m_nk; diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp 
b/src/polybench/POLYBENCH_ADI-Cuda.cpp index 57500408d..a4f92f213 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { -// -// Define thread block size for CUDA execution -// -const size_t block_size = 256; - #define POLYBENCH_ADI_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(U, m_U, m_n * m_n); \ allocAndInitCudaDeviceData(V, m_V, m_n * m_n); \ @@ -40,12 +35,14 @@ const size_t block_size = 256; deallocCudaDeviceData(Q); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void adi1(const Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { @@ -58,12 +55,14 @@ __global__ void adi1(const Index_type n, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { POLYBENCH_ADI_BODY6; for (Index_type j = 1; j < n-1; ++j) { @@ -76,18 +75,20 @@ __global__ void adi2(const Index_type n, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ 
void adi_lam(const Index_type n, Lambda body) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { body(i); } } -void POLYBENCH_ADI::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -104,12 +105,12 @@ void POLYBENCH_ADI::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); - adi1<<>>(n, + adi1<<>>(n, a, b, c, d, f, P, Q, U, V); cudaErrchk( cudaGetLastError() ); - adi2<<>>(n, + adi2<<>>(n, a, c, d, e, f, P, Q, U, V); cudaErrchk( cudaGetLastError() ); @@ -132,7 +133,7 @@ void POLYBENCH_ADI::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); - adi_lam<<>>(n, + adi_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { @@ -146,7 +147,7 @@ void POLYBENCH_ADI::runCudaVariant(VariantID vid) ); cudaErrchk( cudaGetLastError() ); - adi_lam<<>>(n, + adi_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY6; for (Index_type j = 1; j < n-1; ++j) { @@ -243,10 +244,12 @@ void POLYBENCH_ADI::runCudaVariant(VariantID vid) POLYBENCH_ADI_TEARDOWN_CUDA } else { - std::cout << "\n POLYBENCH_ADI : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ADI : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_ADI, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index 9c65190a4..f87ec84f1 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 
2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { -// -// Define thread block size for Hip execution -// -const size_t block_size = 256; - #define POLYBENCH_ADI_DATA_SETUP_HIP \ allocAndInitHipDeviceData(U, m_U, m_n * m_n); \ allocAndInitHipDeviceData(V, m_V, m_n * m_n); \ @@ -41,12 +36,14 @@ const size_t block_size = 256; deallocHipDeviceData(Q); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void adi1(const Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { @@ -59,12 +56,14 @@ __global__ void adi1(const Index_type n, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { POLYBENCH_ADI_BODY6; for (Index_type j = 1; j < n-1; ++j) { @@ -77,18 +76,20 @@ __global__ void adi2(const Index_type n, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void adi_lam(const Index_type n, Lambda body) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { body(i); } } -void POLYBENCH_ADI::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) { const 
Index_type run_reps = getRunReps(); @@ -105,14 +106,14 @@ void POLYBENCH_ADI::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); - hipLaunchKernelGGL((adi1), + hipLaunchKernelGGL((adi1), dim3(grid_size), dim3(block_size), 0, 0, n, a, b, c, d, f, P, Q, U, V); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((adi2), + hipLaunchKernelGGL((adi2), dim3(grid_size), dim3(block_size), 0, 0, n, a, c, d, e, f, @@ -148,7 +149,7 @@ void POLYBENCH_ADI::runHipVariant(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), + hipLaunchKernelGGL((adi_lam), dim3(grid_size), dim3(block_size), 0, 0, n, adi1_lamda); hipErrchk( hipGetLastError() ); @@ -164,7 +165,7 @@ void POLYBENCH_ADI::runHipVariant(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), + hipLaunchKernelGGL((adi_lam), dim3(grid_size), dim3(block_size), 0, 0, n, adi2_lamda); hipErrchk( hipGetLastError() ); @@ -252,10 +253,12 @@ void POLYBENCH_ADI::runHipVariant(VariantID vid) POLYBENCH_ADI_TEARDOWN_HIP } else { - std::cout << "\n POLYBENCH_ADI : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ADI : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_ADI, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI-OMP.cpp b/src/polybench/POLYBENCH_ADI-OMP.cpp index 465237c43..a9409b182 100644 --- a/src/polybench/POLYBENCH_ADI-OMP.cpp +++ b/src/polybench/POLYBENCH_ADI-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_ADI::runOpenMPVariant(VariantID vid) +void POLYBENCH_ADI::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -34,18 +34,18 @@ void POLYBENCH_ADI::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 1; t <= tsteps; ++t) { + for (Index_type t = 1; t <= tsteps; ++t) { #pragma omp parallel for for (Index_type i = 1; i < n-1; ++i) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY3; - } + } POLYBENCH_ADI_BODY4; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY5; - } + } } #pragma omp parallel for @@ -57,7 +57,7 @@ void POLYBENCH_ADI::runOpenMPVariant(VariantID vid) POLYBENCH_ADI_BODY8; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY9; - } + } } } // tstep loop @@ -213,12 +213,12 @@ void POLYBENCH_ADI::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\nPOLYBENCH_ADI Unknown variant id = " << vid << std::endl; + getCout() << "\nPOLYBENCH_ADI Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp index df935ee78..a3cc71346 100644 --- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -33,17 +33,17 @@ namespace polybench allocAndInitOpenMPDeviceData(U, m_U, m_n * m_n, did, hid); \ allocAndInitOpenMPDeviceData(V, m_V, m_n * m_n, did, hid); \ allocAndInitOpenMPDeviceData(P, m_P, m_n * m_n, did, hid); \ - allocAndInitOpenMPDeviceData(Q, m_Q, m_n * m_n, did, hid); + allocAndInitOpenMPDeviceData(Q, m_Q, m_n * m_n, did, hid); #define POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET \ getOpenMPDeviceData(m_U, U, m_n * m_n, hid, did); \ deallocOpenMPDeviceData(U, did); \ deallocOpenMPDeviceData(V, did); \ deallocOpenMPDeviceData(P, did); \ - deallocOpenMPDeviceData(Q, did); + deallocOpenMPDeviceData(Q, did); -void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -56,7 +56,7 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 1; t <= tsteps; ++t) { + for (Index_type t = 1; t <= tsteps; ++t) { #pragma omp target is_device_ptr(P,Q,U,V) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) @@ -64,11 +64,11 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY3; - } + } POLYBENCH_ADI_BODY4; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY5; - } + } } #pragma omp target is_device_ptr(P,Q,U,V) device( did ) @@ -86,10 +86,10 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) } // tsteps - } // run_reps - stopTimer(); + } // run_reps + stopTimer(); - POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET; + POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { @@ -162,9 +162,9 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_ADI : Unknown OMP Target variant id = " << 
vid << std::endl; + getCout() << "\n POLYBENCH_ADI : Unknown OMP Target variant id = " << vid << std::endl; } -} +} } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI-Seq.cpp b/src/polybench/POLYBENCH_ADI-Seq.cpp index 183d56c4c..854a0fdf9 100644 --- a/src/polybench/POLYBENCH_ADI-Seq.cpp +++ b/src/polybench/POLYBENCH_ADI-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -13,12 +13,12 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_ADI::runSeqVariant(VariantID vid) +void POLYBENCH_ADI::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -31,17 +31,17 @@ void POLYBENCH_ADI::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 1; t <= tsteps; ++t) { + for (Index_type t = 1; t <= tsteps; ++t) { for (Index_type i = 1; i < n-1; ++i) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY3; - } + } POLYBENCH_ADI_BODY4; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY5; - } + } } for (Index_type i = 1; i < n-1; ++i) { @@ -52,7 +52,7 @@ void POLYBENCH_ADI::runSeqVariant(VariantID vid) POLYBENCH_ADI_BODY8; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY9; - } + } } } // tstep loop @@ -172,9 +172,9 @@ void POLYBENCH_ADI::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 1; t <= tsteps; ++t) { + for (Index_type t = 1; t <= tsteps; ++t) { - RAJA::kernel( + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, RAJA::RangeSegment{1, n-1}, 
RAJA::RangeStrideSegment{n-2, 0, -1}), @@ -208,7 +208,7 @@ void POLYBENCH_ADI::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\nPOLYBENCH_ADI Unknown variant id = " << vid << std::endl; + getCout() << "\nPOLYBENCH_ADI Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index c36b41050..7d0844e69 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,7 +21,7 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) : KernelBase(rajaperf::Polybench_ADI, params) { Index_type n_default = 1000; - + setDefaultProblemSize( (n_default-2) * (n_default-2) ); setDefaultReps(4); @@ -39,7 +39,7 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setFLOPsPerRep( m_tsteps * ( (15 + 2) * (m_n-2)*(m_n-2) + (15 + 2) * (m_n-2)*(m_n-2) ) ); - checksum_scale_factor = 0.0000001 * + checksum_scale_factor = 0.0000001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); @@ -69,7 +69,7 @@ POLYBENCH_ADI::~POLYBENCH_ADI() { } -void POLYBENCH_ADI::setUp(VariantID vid) +void POLYBENCH_ADI::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_U, m_n * m_n, 0.0, vid); allocAndInitData(m_V, m_n * m_n, vid); @@ -77,12 +77,12 @@ void POLYBENCH_ADI::setUp(VariantID vid) allocAndInitData(m_Q, m_n * m_n, vid); } -void POLYBENCH_ADI::updateChecksum(VariantID vid) +void POLYBENCH_ADI::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor 
); } -void POLYBENCH_ADI::tearDown(VariantID vid) +void POLYBENCH_ADI::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_U); diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index bec422925..7cd579964 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -186,17 +186,27 @@ class POLYBENCH_ADI : public KernelBase ~POLYBENCH_ADI(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Index_type m_n; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp 
index 58d37fb80..66b0d3218 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - #define POLYBENCH_ATAX_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(tmp, m_tmp, N); \ allocAndInitCudaDeviceData(y, m_y, N); \ @@ -41,10 +36,12 @@ namespace polybench deallocCudaDeviceData(A); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_ATAX_BODY1; @@ -55,10 +52,12 @@ __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_atax_2(Real_ptr A, Real_ptr tmp, Real_ptr y, Index_type N) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < N) { POLYBENCH_ATAX_BODY4; @@ -69,11 +68,12 @@ __global__ void poly_atax_2(Real_ptr A, Real_ptr tmp, Real_ptr y, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_atax_lam(Index_type N, Lambda body) { - Index_type ti = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ti = blockIdx.x * block_size + threadIdx.x; if (ti < N) { body(ti); @@ -81,7 +81,8 @@ __global__ void poly_atax_lam(Index_type N, } -void 
POLYBENCH_ATAX::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -96,10 +97,10 @@ void POLYBENCH_ATAX::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_atax_1<<>>(A, x, y, tmp, N); + poly_atax_1<<>>(A, x, y, tmp, N); cudaErrchk( cudaGetLastError() ); - poly_atax_2<<>>(A, tmp, y, N); + poly_atax_2<<>>(A, tmp, y, N); cudaErrchk( cudaGetLastError() ); } @@ -116,7 +117,7 @@ void POLYBENCH_ATAX::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_atax_lam<<>>(N, + poly_atax_lam<<>>(N, [=] __device__ (Index_type i) { POLYBENCH_ATAX_BODY1; for (Index_type j = 0; j < N; ++j ) { @@ -127,7 +128,7 @@ void POLYBENCH_ATAX::runCudaVariant(VariantID vid) ); cudaErrchk( cudaGetLastError() ); - poly_atax_lam<<>>(N, + poly_atax_lam<<>>(N, [=] __device__ (Index_type j) { POLYBENCH_ATAX_BODY4; for (Index_type i = 0; i < N; ++i ) { @@ -225,11 +226,12 @@ void POLYBENCH_ATAX::runCudaVariant(VariantID vid) POLYBENCH_ATAX_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_ATAX : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_ATAX, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 6d393a83b..8e1078c89 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for Hip execution - // - const size_t block_size = 256; - #define POLYBENCH_ATAX_DATA_SETUP_HIP \ allocAndInitHipDeviceData(tmp, m_tmp, N); \ allocAndInitHipDeviceData(y, m_y, N); \ @@ -41,10 +36,12 @@ namespace polybench deallocHipDeviceData(A); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_ATAX_BODY1; @@ -55,10 +52,12 @@ __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_atax_2(Real_ptr A, Real_ptr tmp, Real_ptr y, Index_type N) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < N) { POLYBENCH_ATAX_BODY4; @@ -69,11 +68,12 @@ __global__ void poly_atax_2(Real_ptr A, Real_ptr tmp, Real_ptr y, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_atax_lam(Index_type N, Lambda body) { - Index_type ti = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ti = blockIdx.x * block_size + threadIdx.x; if (ti < N) { body(ti); @@ -81,7 +81,8 @@ __global__ void poly_atax_lam(Index_type N, } -void POLYBENCH_ATAX::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -96,12 +97,12 @@ void POLYBENCH_ATAX::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_atax_1), + hipLaunchKernelGGL((poly_atax_1), dim3(grid_size), dim3(block_size), 0, 0, A, x, y, tmp, N); 
hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_atax_2), + hipLaunchKernelGGL((poly_atax_2), dim3(grid_size), dim3(block_size), 0, 0, A, tmp, y, N); hipErrchk( hipGetLastError() ); @@ -128,7 +129,7 @@ void POLYBENCH_ATAX::runHipVariant(VariantID vid) POLYBENCH_ATAX_BODY3; }; - hipLaunchKernelGGL((poly_atax_lam), + hipLaunchKernelGGL((poly_atax_lam), dim3(grid_size), dim3(block_size), 0, 0, N, poly_atax_1_lambda); hipErrchk( hipGetLastError() ); @@ -141,7 +142,7 @@ void POLYBENCH_ATAX::runHipVariant(VariantID vid) POLYBENCH_ATAX_BODY6; }; - hipLaunchKernelGGL((poly_atax_lam), + hipLaunchKernelGGL((poly_atax_lam), dim3(grid_size), dim3(block_size), 0, 0, N, poly_atax_2_lambda); hipErrchk( hipGetLastError() ); @@ -232,11 +233,12 @@ void POLYBENCH_ATAX::runHipVariant(VariantID vid) POLYBENCH_ATAX_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_ATAX : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_ATAX, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ATAX-OMP.cpp b/src/polybench/POLYBENCH_ATAX-OMP.cpp index 8b7bf1113..504a293a3 100644 --- a/src/polybench/POLYBENCH_ATAX-OMP.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid) +void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -60,19 +60,19 @@ void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid) case Lambda_OpenMP : { - auto poly_atax_base_lam2 = [=] (Index_type i, Index_type j, + auto poly_atax_base_lam2 = [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_ATAX_BODY2; }; - auto poly_atax_base_lam3 = [=] (Index_type i, + auto poly_atax_base_lam3 = [=] (Index_type i, Real_type &dot) { POLYBENCH_ATAX_BODY3; }; - auto poly_atax_base_lam5 = [=] (Index_type i, Index_type j , + auto poly_atax_base_lam5 = [=] (Index_type i, Index_type j , Real_type &dot) { POLYBENCH_ATAX_BODY5; }; - auto poly_atax_base_lam6 = [=] (Index_type j, + auto poly_atax_base_lam6 = [=] (Index_type j, Real_type &dot) { POLYBENCH_ATAX_BODY6; }; @@ -148,10 +148,10 @@ void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid) > >; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), @@ -172,21 +172,21 @@ void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid) poly_atax_lam5, poly_atax_lam6 - ); + ); } stopTimer(); - + break; } default : { - std::cout << "\n POLYBENCH_ATAX : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp index 69b50d8cb..1f9c23844 100644 --- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, 
Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(A, did); -void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -66,7 +66,7 @@ void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid) } POLYBENCH_ATAX_BODY3; } - + #pragma omp target is_device_ptr(y,tmp,A) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type j = 0; j < N; ++j ) { @@ -153,7 +153,7 @@ void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid) POLYBENCH_ATAX_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_ATAX : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -162,4 +162,4 @@ void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_ATAX-Seq.cpp b/src/polybench/POLYBENCH_ATAX-Seq.cpp index f4bb51937..ecb98f3e8 100644 --- a/src/polybench/POLYBENCH_ATAX-Seq.cpp +++ b/src/polybench/POLYBENCH_ATAX-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_ATAX::runSeqVariant(VariantID vid) +void POLYBENCH_ATAX::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); @@ -57,19 +57,19 @@ void POLYBENCH_ATAX::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_atax_base_lam2 = [=] (Index_type i, Index_type j, + auto poly_atax_base_lam2 = [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_ATAX_BODY2; }; - auto poly_atax_base_lam3 = [=] (Index_type i, + auto poly_atax_base_lam3 = [=] (Index_type i, Real_type &dot) { POLYBENCH_ATAX_BODY3; }; - auto poly_atax_base_lam5 = [=] (Index_type i, Index_type j , + auto poly_atax_base_lam5 = [=] (Index_type i, Index_type j , Real_type &dot) { POLYBENCH_ATAX_BODY5; }; - auto poly_atax_base_lam6 = [=] (Index_type j, + auto poly_atax_base_lam6 = [=] (Index_type j, Real_type &dot) { POLYBENCH_ATAX_BODY6; }; @@ -148,8 +148,8 @@ void POLYBENCH_ATAX::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, @@ -158,8 +158,8 @@ void POLYBENCH_ATAX::runSeqVariant(VariantID vid) poly_atax_lam3 ); - - RAJA::kernel_param( + + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, @@ -178,7 +178,7 @@ void POLYBENCH_ATAX::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_ATAX : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index e06917239..44a805518 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -29,7 +29,7 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) m_N = std::sqrt( getTargetProblemSize() )+1; - setActualProblemSize( m_N * m_N ); + setActualProblemSize( m_N * m_N ); setItsPerRep( m_N + m_N ); setKernelsPerRep(2); @@ -41,7 +41,7 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); - checksum_scale_factor = 0.001 * + checksum_scale_factor = 0.001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); @@ -71,7 +71,7 @@ POLYBENCH_ATAX::~POLYBENCH_ATAX() { } -void POLYBENCH_ATAX::setUp(VariantID vid) +void POLYBENCH_ATAX::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_tmp, m_N, vid); @@ -80,12 +80,12 @@ void POLYBENCH_ATAX::setUp(VariantID vid) allocAndInitDataConst(m_y, m_N, 0.0, vid); } -void POLYBENCH_ATAX::updateChecksum(VariantID vid) +void POLYBENCH_ATAX::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_y, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_y, m_N, checksum_scale_factor ); } -void POLYBENCH_ATAX::tearDown(VariantID vid) +void POLYBENCH_ATAX::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_tmp); diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index d2c5ec63e..8f28a1470 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National 
Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -106,17 +106,27 @@ class POLYBENCH_ATAX : public KernelBase ~POLYBENCH_ATAX(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Index_type m_N; Real_ptr m_tmp; Real_ptr m_y; diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index a6c67b852..6b4e8c636 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,15 +22,16 @@ namespace polybench { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - const size_t block_size = 256; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) - constexpr size_t j_block_sz = 32; - constexpr size_t i_block_sz = 8; +#define FDTD_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz #define FDTD_2D_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block234(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block234(FDTD_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define FDTD_2D_NBLOCKS_CUDA \ dim3 nblocks234(static_cast(RAJA_DIVIDE_CEILING_INT(ny, j_block_sz)), \ @@ -52,89 +53,101 @@ namespace polybench deallocCudaDeviceData(fict); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_fdtd2d_1(Real_ptr ey, Real_ptr fict, Index_type ny, Index_type t) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < ny) { POLYBENCH_FDTD_2D_BODY1; } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_fdtd2d_1_lam(Index_type ny, Lambda body) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < ny) { body(j); } } -__global__ void poly_fdtd2d_2(Real_ptr ey, Real_ptr hz, +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) +__global__ void poly_fdtd2d_2(Real_ptr ey, Real_ptr hz, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i > 0 && i < nx && j < ny) { POLYBENCH_FDTD_2D_BODY2; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, 
typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_2_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i > 0 && i < nx && j < ny) { - body(i, j); + body(i, j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_3(Real_ptr ex, Real_ptr hz, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx && j > 0 && j < ny) { POLYBENCH_FDTD_2D_BODY3; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_3_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx && j > 0 && j < ny) { body(i, j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_4(Real_ptr hz, Real_ptr ex, Real_ptr ey, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx-1 && j < ny-1) { POLYBENCH_FDTD_2D_BODY4; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename 
Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_4_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx-1 && j < ny-1) { body(i, j); @@ -142,7 +155,8 @@ __global__ void poly_fdtd2d_4_lam(Index_type nx, Index_type ny, } -void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -159,19 +173,22 @@ void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1<<>>(ey, fict, ny, t); + poly_fdtd2d_1<<>>(ey, fict, ny, t); cudaErrchk( cudaGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_2<<>>(ey, hz, nx, ny); + poly_fdtd2d_2 + <<>>(ey, hz, nx, ny); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_3<<>>(ex, hz, nx, ny); + poly_fdtd2d_3 + <<>>(ex, hz, nx, ny); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_4<<>>(hz, ex, ey, nx, ny); + poly_fdtd2d_4 + <<>>(hz, ex, ey, nx, ny); cudaErrchk( cudaGetLastError() ); } // tstep loop @@ -192,7 +209,7 @@ void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1_lam<<>>(ny, + poly_fdtd2d_1_lam<<>>(ny, [=] __device__ (Index_type j) { POLYBENCH_FDTD_2D_BODY1; } @@ -201,21 +218,24 @@ void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_2_lam<<>>(nx, ny, + poly_fdtd2d_2_lam + <<>>(nx, ny, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY2; } ); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_3_lam<<>>(nx, ny, + poly_fdtd2d_3_lam + <<>>(nx, ny, [=] __device__ (Index_type 
i, Index_type j) { POLYBENCH_FDTD_2D_BODY3; } ); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_4_lam<<>>(nx, ny, + poly_fdtd2d_4_lam + <<>>(nx, ny, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY4; } @@ -296,11 +316,12 @@ void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) POLYBENCH_FDTD_2D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_FDTD_2D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FDTD_2D : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_FDTD_2D, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index b627d84f8..0ca25f1e0 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,15 +22,16 @@ namespace polybench { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - const size_t block_size = 256; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) - constexpr size_t j_block_sz = 32; - constexpr size_t i_block_sz = 8; +#define FDTD_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz #define FDTD_2D_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block234(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block234(FDTD_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define FDTD_2D_NBLOCKS_HIP \ dim3 nblocks234(static_cast(RAJA_DIVIDE_CEILING_INT(ny, j_block_sz)), \ @@ -51,89 +52,101 @@ namespace polybench deallocHipDeviceData(fict); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_fdtd2d_1(Real_ptr ey, Real_ptr fict, Index_type ny, Index_type t) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < ny) { POLYBENCH_FDTD_2D_BODY1; } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_fdtd2d_1_lam(Index_type ny, Lambda body) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < ny) { body(j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_2(Real_ptr ey, Real_ptr hz, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i > 0 && i < nx && j < ny) { POLYBENCH_FDTD_2D_BODY2; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) 
__global__ void poly_fdtd2d_2_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i > 0 && i < nx && j < ny) { body(i, j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_3(Real_ptr ex, Real_ptr hz, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx && j > 0 && j < ny) { POLYBENCH_FDTD_2D_BODY3; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_3_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx && j > 0 && j < ny) { body(i, j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_4(Real_ptr hz, Real_ptr ex, Real_ptr ey, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx-1 && j < ny-1) { POLYBENCH_FDTD_2D_BODY4; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void 
poly_fdtd2d_4_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx-1 && j < ny-1) { body(i, j); @@ -141,7 +154,8 @@ __global__ void poly_fdtd2d_4_lam(Index_type nx, Index_type ny, } -void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -157,25 +171,25 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) for (t = 0; t < tsteps; ++t) { const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1), - dim3(grid_size1), dim3(block_size), 0, 0, + hipLaunchKernelGGL((poly_fdtd2d_1), + dim3(grid_size1), dim3(block_size), 0, 0, ey, fict, ny, t); hipErrchk( hipGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_fdtd2d_2), + hipLaunchKernelGGL((poly_fdtd2d_2), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, ey, hz, nx, ny); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_3), + hipLaunchKernelGGL((poly_fdtd2d_3), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, ex, hz, nx, ny); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_4), + hipLaunchKernelGGL((poly_fdtd2d_4), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, hz, ex, ey, nx, ny); hipErrchk( hipGetLastError() ); @@ -202,7 +216,7 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) POLYBENCH_FDTD_2D_BODY1; }; - hipLaunchKernelGGL(poly_fdtd2d_1_lam, + hipLaunchKernelGGL((poly_fdtd2d_1_lam), dim3(grid_size1), dim3(block_size), 0, 0, ny, poly_fdtd2d_1_lambda); hipErrchk( hipGetLastError() ); @@ -210,32 +224,32 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - auto 
poly_fdtd2d_2_lambda = + auto poly_fdtd2d_2_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY2; }; - hipLaunchKernelGGL((poly_fdtd2d_2_lam), + hipLaunchKernelGGL((poly_fdtd2d_2_lam), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, nx, ny, poly_fdtd2d_2_lambda); hipErrchk( hipGetLastError() ); - auto poly_fdtd2d_3_lambda = + auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY3; }; - hipLaunchKernelGGL((poly_fdtd2d_3_lam), + hipLaunchKernelGGL((poly_fdtd2d_3_lam), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, nx, ny, poly_fdtd2d_3_lambda); hipErrchk( hipGetLastError() ); - - auto poly_fdtd2d_4_lambda = + + auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY4; }; - hipLaunchKernelGGL((poly_fdtd2d_4_lam), + hipLaunchKernelGGL((poly_fdtd2d_4_lam), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, nx, ny, poly_fdtd2d_4_lambda); hipErrchk( hipGetLastError() ); @@ -314,11 +328,12 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) POLYBENCH_FDTD_2D_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_FDTD_2D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FDTD_2D : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_FDTD_2D, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp index 157b4b12a..dba8a872a 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid) +void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -146,7 +146,7 @@ void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid) using EXEC_POL1 = RAJA::omp_parallel_for_exec; - using EXEC_POL234 = + using EXEC_POL234 = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::omp_parallel_for_exec, RAJA::statement::For<1, RAJA::loop_exec, @@ -191,12 +191,12 @@ void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\nPOLYBENCH_FDTD_2D Unknown variant id = " << vid << std::endl; + getCout() << "\nPOLYBENCH_FDTD_2D Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp index 0bde775fd..5bb0d03b3 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp @@ -1,10 +1,10 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_FDTD_2D.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -43,7 +43,7 @@ namespace polybench deallocOpenMPDeviceData(fict, did); -void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -153,7 +153,7 @@ void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_FDTD_2D_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_FDTD_2D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FDTD_2D : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -162,4 +162,4 @@ void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp index 4e6078778..6ab94557d 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) +void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -31,7 +31,7 @@ void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (t = 0; t < tsteps; ++t) { + for (t = 0; t < tsteps; ++t) { for (Index_type j = 0; j < ny; j++) { POLYBENCH_FDTD_2D_BODY1; @@ -137,7 +137,7 @@ void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) using EXEC_POL1 = RAJA::loop_exec; - using EXEC_POL234 = + using EXEC_POL234 = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::For<1, RAJA::loop_exec, @@ -149,9 +149,9 @@ void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (t = 0; t < tsteps; ++t) { + for (t = 0; t < tsteps; ++t) { - RAJA::forall( RAJA::RangeSegment(0, ny), + RAJA::forall( RAJA::RangeSegment(0, ny), poly_fdtd2d_lam1 ); @@ -184,7 +184,7 @@ void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\nPOLYBENCH_FDTD_2D Unknown variant id = " << vid << std::endl; + getCout() << "\nPOLYBENCH_FDTD_2D Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 59e03721c..dce05e76a 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -27,7 +27,7 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) Index_type nx_default = 1000; Index_type ny_default = 1000; - setDefaultProblemSize( std::max( (nx_default-1) * ny_default, + setDefaultProblemSize( std::max( (nx_default-1) * ny_default, nx_default * (ny_default-1) ) ); setDefaultReps(8); @@ -36,7 +36,7 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) m_tsteps = 40; - setActualProblemSize( std::max( (m_nx-1)*m_ny, m_nx*(m_ny-1) ) ); + setActualProblemSize( std::max( (m_nx-1)*m_ny, m_nx*(m_ny-1) ) ); setItsPerRep( m_tsteps * ( m_ny + (m_nx-1)*m_ny + @@ -90,7 +90,7 @@ POLYBENCH_FDTD_2D::~POLYBENCH_FDTD_2D() { } -void POLYBENCH_FDTD_2D::setUp(VariantID vid) +void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_hz, m_nx * m_ny, 0.0, vid); allocAndInitData(m_ex, m_nx * m_ny, vid); @@ -98,12 +98,12 @@ void POLYBENCH_FDTD_2D::setUp(VariantID vid) allocAndInitData(m_fict, m_tsteps, vid); } -void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid) +void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor); + checksum[vid][tune_idx] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor); } -void POLYBENCH_FDTD_2D::tearDown(VariantID vid) +void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_fict); diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index a1ead28b2..7d3696293 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -104,17 +104,28 @@ class POLYBENCH_FDTD_2D : public KernelBase ~POLYBENCH_FDTD_2D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Index_type m_nx; Index_type m_ny; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index bc4d79352..30e9a54b4 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -20,13 +20,16 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) + +#define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz #define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define POLY_FLOYD_WARSHALL_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), \ @@ -45,24 +48,27 @@ constexpr size_t j_block_sz = 32; deallocCudaDeviceData(pout); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_floyd_warshall(Real_ptr pout, Real_ptr pin, Index_type k, Index_type N) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; - if ( i < N && j < N ) { + if ( i < N && j < N ) { POLYBENCH_FLOYD_WARSHALL_BODY; } } -template< typename Lambda > -__global__ void poly_floyd_warshall_lam(Index_type N, +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) +__global__ void poly_floyd_warshall_lam(Index_type N, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < N && j < N ) { body(i, j); @@ -70,7 +76,8 @@ __global__ void poly_floyd_warshall_lam(Index_type N, } -void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) +template < size_t block_size > +void 
POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -87,8 +94,9 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_CUDA; POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; - - poly_floyd_warshall<<>>(pout, pin, + + poly_floyd_warshall + <<>>(pout, pin, k, N); cudaErrchk( cudaGetLastError() ); @@ -111,7 +119,8 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_CUDA; POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; - poly_floyd_warshall_lam<<>>(N, + poly_floyd_warshall_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY; } @@ -166,11 +175,12 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) POLYBENCH_FLOYD_WARSHALL_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_FLOYD_WARSHALL, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index af451b139..99b8ea303 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -20,13 +20,16 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) + +#define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz #define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define POLY_FLOYD_WARSHALL_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), \ @@ -44,24 +47,27 @@ constexpr size_t j_block_sz = 32; deallocHipDeviceData(pin); \ deallocHipDeviceData(pout); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_floyd_warshall(Real_ptr pout, Real_ptr pin, Index_type k, Index_type N) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < N && j < N ) { POLYBENCH_FLOYD_WARSHALL_BODY; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_floyd_warshall_lam(Index_type N, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < N && j < N ) { body(i, j); @@ -69,7 +75,8 @@ __global__ void poly_floyd_warshall_lam(Index_type N, } -void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) { 
const Index_type run_reps = getRunReps(); @@ -87,7 +94,7 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP; POLY_FLOYD_WARSHALL_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_floyd_warshall), + hipLaunchKernelGGL((poly_floyd_warshall), dim3(nblocks), dim3(nthreads_per_block), 0, 0, pout, pin, k, N); @@ -109,16 +116,16 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) for (Index_type k = 0; k < N; ++k) { - auto poly_floyd_warshall_lambda = + auto poly_floyd_warshall_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY; }; POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP; - POLY_FLOYD_WARSHALL_NBLOCKS_HIP; + POLY_FLOYD_WARSHALL_NBLOCKS_HIP; hipLaunchKernelGGL( - (poly_floyd_warshall_lam), + (poly_floyd_warshall_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, N, poly_floyd_warshall_lambda); hipErrchk( hipGetLastError() ); @@ -172,11 +179,12 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) POLYBENCH_FLOYD_WARSHALL_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_FLOYD_WARSHALL, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp index 974fa2342..edb2074f1 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,13 +18,13 @@ //#define USE_RAJA_OMP_COLLAPSE #undef USE_RAJA_OMP_COLLAPSE -namespace rajaperf +namespace rajaperf { namespace polybench { - -void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) + +void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -45,7 +45,7 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) #else #pragma omp parallel for #endif - for (Index_type i = 0; i < N; ++i) { + for (Index_type i = 0; i < N; ++i) { for (Index_type j = 0; j < N; ++j) { POLYBENCH_FLOYD_WARSHALL_BODY; } @@ -60,7 +60,7 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) case Lambda_OpenMP : { - auto poly_floydwarshall_base_lam = [=](Index_type k, Index_type i, + auto poly_floydwarshall_base_lam = [=](Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY; }; @@ -89,9 +89,9 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) case RAJA_OpenMP : { - POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; + POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; - auto poly_floydwarshall_lam = [=](Index_type k, Index_type i, + auto poly_floydwarshall_lam = [=](Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; }; @@ -125,7 +125,7 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), - poly_floydwarshall_lam + poly_floydwarshall_lam ); } @@ -135,12 +135,12 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp index 
d59441b7a..6c8a9d5fa 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp @@ -1,10 +1,10 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_FLOYD_WARSHALL.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -35,7 +35,7 @@ namespace polybench deallocOpenMPDeviceData(pout, did); -void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -98,7 +98,7 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid) POLYBENCH_FLOYD_WARSHALL_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -107,4 +107,4 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp index 1a698dace..b9f42b0ed 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, 
Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - -void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) + +void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); @@ -31,9 +31,9 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type k = 0; k < N; ++k) { - for (Index_type i = 0; i < N; ++i) { - for (Index_type j = 0; j < N; ++j) { + for (Index_type k = 0; k < N; ++k) { + for (Index_type i = 0; i < N; ++i) { + for (Index_type j = 0; j < N; ++j) { POLYBENCH_FLOYD_WARSHALL_BODY; } } @@ -49,7 +49,7 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_floydwarshall_base_lam = [=](Index_type k, Index_type i, + auto poly_floydwarshall_base_lam = [=](Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY; }; @@ -73,9 +73,9 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) case RAJA_Seq : { - POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; + POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; - auto poly_floydwarshall_lam = [=](Index_type k, Index_type i, + auto poly_floydwarshall_lam = [=](Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; }; @@ -97,7 +97,7 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), - poly_floydwarshall_lam + poly_floydwarshall_lam ); } @@ -108,7 +108,7 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL 
: Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index b3306a992..1022ffe4f 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -23,7 +23,7 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) { Index_type N_default = 1000; - setDefaultProblemSize( N_default * N_default ); + setDefaultProblemSize( N_default * N_default ); setDefaultReps(8); m_N = std::sqrt( getTargetProblemSize() ) + 1; @@ -66,19 +66,19 @@ POLYBENCH_FLOYD_WARSHALL::~POLYBENCH_FLOYD_WARSHALL() { } -void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitDataRandSign(m_pin, m_N*m_N, vid); allocAndInitDataConst(m_pout, m_N*m_N, 0.0, vid); } -void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_pin); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index ec2bcab9f..283231d29 100644 --- 
a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -67,17 +67,28 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase ~POLYBENCH_FLOYD_WARSHALL(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Index_type m_N; Real_ptr m_pin; diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index ae586d1f4..5101ebc00 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, 
Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,13 +22,16 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) + +#define POLY_GEMM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz #define POLY_GEMM_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(POLY_GEMM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define POLY_GEMM_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(nj, j_block_sz)), \ @@ -49,12 +52,14 @@ constexpr size_t j_block_sz = 32; deallocCudaDeviceData(C); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, Real_type alpha, Real_type beta, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < ni && j < nj ) { POLYBENCH_GEMM_BODY1; @@ -66,12 +71,13 @@ __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm_lam(Index_type ni, Index_type nj, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; 
if ( i < ni && j < nj ) { body(i, j); @@ -79,7 +85,8 @@ __global__ void poly_gemm_lam(Index_type ni, Index_type nj, } -void POLYBENCH_GEMM::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -93,9 +100,10 @@ void POLYBENCH_GEMM::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { POLY_GEMM_THREADS_PER_BLOCK_CUDA; - POLY_GEMM_NBLOCKS_CUDA; + POLY_GEMM_NBLOCKS_CUDA; - poly_gemm<<>>(C, A, B, + poly_gemm + <<>>(C, A, B, alpha, beta, ni, nj, nk); cudaErrchk( cudaGetLastError() ); @@ -115,7 +123,8 @@ void POLYBENCH_GEMM::runCudaVariant(VariantID vid) POLY_GEMM_THREADS_PER_BLOCK_CUDA; POLY_GEMM_NBLOCKS_CUDA; - poly_gemm_lam<<>>(ni, nj, + poly_gemm_lam + <<>>(ni, nj, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMM_BODY1; POLYBENCH_GEMM_BODY2; @@ -192,11 +201,12 @@ void POLYBENCH_GEMM::runCudaVariant(VariantID vid) POLYBENCH_GEMM_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_GEMM : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GEMM, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 2d07f0a86..ed2c7fcff 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,13 +22,16 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) + +#define POLY_GEMM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz #define POLY_GEMM_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(POLY_GEMM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define POLY_GEMM_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(nj, j_block_sz)), \ @@ -49,6 +52,8 @@ constexpr size_t j_block_sz = 32; deallocHipDeviceData(C); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, Real_type alpha, Real_type beta, Index_type ni, Index_type nj, Index_type nk) @@ -66,7 +71,8 @@ __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm_lam(Index_type ni, Index_type nj, Lambda body) { @@ -79,7 +85,8 @@ __global__ void poly_gemm_lam(Index_type ni, Index_type nj, } -void POLYBENCH_GEMM::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -93,9 +100,9 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { POLY_GEMM_THREADS_PER_BLOCK_HIP; - POLY_GEMM_NBLOCKS_HIP; + POLY_GEMM_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_gemm), + hipLaunchKernelGGL((poly_gemm), dim3(nblocks), dim3(nthreads_per_block), 0, 0, C, A, B, alpha, beta, ni, nj, nk); @@ -114,7 +121,7 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { POLY_GEMM_THREADS_PER_BLOCK_HIP; - POLY_GEMM_NBLOCKS_HIP; + POLY_GEMM_NBLOCKS_HIP; auto poly_gemm_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMM_BODY1; @@ -125,7 +132,7 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) POLYBENCH_GEMM_BODY4; }; - hipLaunchKernelGGL((poly_gemm_lam), + hipLaunchKernelGGL((poly_gemm_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, ni, nj, poly_gemm_lambda); hipErrchk( hipGetLastError() ); @@ -195,11 +202,12 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) POLYBENCH_GEMM_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_GEMM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GEMM, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMM-OMP.cpp b/src/polybench/POLYBENCH_GEMM-OMP.cpp index ece6d4b22..53bddc30c 100644 --- a/src/polybench/POLYBENCH_GEMM-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid) +void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -70,7 +70,7 @@ void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for collapse(2) - for (Index_type i = 0; i < ni; ++i ) { + for (Index_type i = 0; i < ni; ++i ) { for (Index_type j = 0; j < nj; ++j ) { POLYBENCH_GEMM_BODY1; poly_gemm_base_lam2(i, j); @@ -123,7 +123,7 @@ void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::kernel_param( - + RAJA::make_tuple( RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk} ), @@ -143,12 +143,12 @@ void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_GEMM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp index 63314c8b3..7bbf5132b 100644 --- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp @@ -1,10 +1,10 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_GEMM.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -37,7 +37,7 @@ namespace polybench deallocOpenMPDeviceData(C, did); -void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -103,11 +103,11 @@ void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid) [=] (Index_type i, Index_type j) { POLYBENCH_GEMM_BODY2_RAJA; }, - [=] (Index_type i, Index_type j, Index_type k, + [=] (Index_type i, Index_type j, Index_type k, Real_type& dot) { POLYBENCH_GEMM_BODY3_RAJA; }, - [=] (Index_type i, Index_type j, + [=] (Index_type i, Index_type j, Real_type& dot) { POLYBENCH_GEMM_BODY4_RAJA; } @@ -119,7 +119,7 @@ void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid) POLYBENCH_GEMM_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_GEMM : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -128,4 +128,4 @@ void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_GEMM-Seq.cpp b/src/polybench/POLYBENCH_GEMM-Seq.cpp index fe42374fb..51a1f1127 100644 --- a/src/polybench/POLYBENCH_GEMM-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_GEMM::runSeqVariant(VariantID vid) +void POLYBENCH_GEMM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); @@ -32,7 +32,7 @@ void POLYBENCH_GEMM::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < ni; ++i ) { + for (Index_type i = 0; i < ni; ++i ) { for (Index_type j = 0; j < nj; ++j ) { POLYBENCH_GEMM_BODY1; POLYBENCH_GEMM_BODY2; @@ -94,7 +94,7 @@ void POLYBENCH_GEMM::runSeqVariant(VariantID vid) auto poly_gemm_lam2 = [=](Index_type i, Index_type j) { POLYBENCH_GEMM_BODY2_RAJA; }; - auto poly_gemm_lam3 = [=](Index_type i, Index_type j, Index_type k, + auto poly_gemm_lam3 = [=](Index_type i, Index_type j, Index_type k, Real_type& dot) { POLYBENCH_GEMM_BODY3_RAJA; }; @@ -121,7 +121,7 @@ void POLYBENCH_GEMM::runSeqVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::kernel_param( - + RAJA::make_tuple( RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk} ), @@ -142,7 +142,7 @@ void POLYBENCH_GEMM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_GEMM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index a50ac09da..0ee1f41be 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -31,7 +31,7 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) m_ni = std::sqrt( getTargetProblemSize() ) + 1; m_nj = m_ni; m_nk = nk_default; - + m_alpha = 0.62; m_beta = 1.002; @@ -76,7 +76,7 @@ POLYBENCH_GEMM::~POLYBENCH_GEMM() { } -void POLYBENCH_GEMM::setUp(VariantID vid) +void POLYBENCH_GEMM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_A, m_ni * m_nk, vid); @@ -84,12 +84,12 @@ void POLYBENCH_GEMM::setUp(VariantID vid) allocAndInitDataConst(m_C, m_ni * m_nj, 0.0, vid); } -void POLYBENCH_GEMM::updateChecksum(VariantID vid) +void POLYBENCH_GEMM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor ); } -void POLYBENCH_GEMM::tearDown(VariantID vid) +void POLYBENCH_GEMM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index dd9e4a5a7..ae218397d 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -90,17 +90,28 @@ class POLYBENCH_GEMM : public KernelBase ~POLYBENCH_GEMM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Index_type m_ni; Index_type m_nj; Index_type m_nk; diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 8d2ddca87..652bbf761 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,15 +22,16 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -const size_t block_size = 256; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define GEMVER_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz #define GEMVER_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block1(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block1(GEMVER_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define GEMVER_NBLOCKS_CUDA \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(n, j_block_sz)), \ @@ -62,36 +63,41 @@ constexpr size_t j_block_sz = 32; deallocCudaDeviceData(y); \ deallocCudaDeviceData(z); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1(Real_ptr A, Real_ptr u1, Real_ptr v1, Real_ptr u2, Real_ptr v2, Index_type n) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < n && j < n) { POLYBENCH_GEMVER_BODY1; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < n && j < n) { body(i, j); } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_2(Real_ptr A, Real_ptr x, Real_ptr y, Real_type beta, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * 
block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; ++j) { @@ -101,21 +107,25 @@ __global__ void poly_gemmver_2(Real_ptr A, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY5; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_4(Real_ptr A, Real_ptr x, Real_ptr w, Real_type alpha, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; ++j) { @@ -125,17 +135,19 @@ __global__ void poly_gemmver_4(Real_ptr A, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_gemmver_234_lam(Index_type n, Lambda body) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { body(i); } } -void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -151,22 +163,23 @@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) GEMVER_THREADS_PER_BLOCK_CUDA; GEMVER_NBLOCKS_CUDA; - poly_gemmver_1<<>>(A, u1, v1, u2, v2, + poly_gemmver_1 + <<>>(A, u1, v1, u2, v2, n); cudaErrchk( cudaGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_2<<>>(A, x, y, + poly_gemmver_2<<>>(A, x, y, beta, n); cudaErrchk( cudaGetLastError() ); - poly_gemmver_3<<>>(x, z, + poly_gemmver_3<<>>(x, z, n); cudaErrchk( cudaGetLastError() ); - poly_gemmver_4<<>>(A, x, w, + poly_gemmver_4<<>>(A, x, w, alpha, n); cudaErrchk( cudaGetLastError() ); @@ -186,7 +199,8 
@@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) GEMVER_THREADS_PER_BLOCK_CUDA; GEMVER_NBLOCKS_CUDA; - poly_gemmver_1_lam<<>>(n, + poly_gemmver_1_lam + <<>>(n, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1; } @@ -195,7 +209,7 @@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_234_lam<<>>(n, + poly_gemmver_234_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; ++j) { @@ -206,14 +220,14 @@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) ); cudaErrchk( cudaGetLastError() ); - poly_gemmver_234_lam<<>>(n, + poly_gemmver_234_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY5; } ); cudaErrchk( cudaGetLastError() ); - poly_gemmver_234_lam<<>>(n, + poly_gemmver_234_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; ++j) { @@ -250,7 +264,7 @@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) > > > - >; + >; using EXEC_POL24 = RAJA::KernelPolicy< @@ -324,11 +338,12 @@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) POLYBENCH_GEMVER_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_GEMVER : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GEMVER, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 469f620a3..943958e31 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project 
contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,15 +22,16 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -const size_t block_size = 256; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define GEMVER_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz #define GEMVER_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block1(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block1(GEMVER_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define GEMVER_NBLOCKS_HIP \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(n, j_block_sz)), \ @@ -62,36 +63,41 @@ constexpr size_t j_block_sz = 32; deallocHipDeviceData(y); \ deallocHipDeviceData(z); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1(Real_ptr A, Real_ptr u1, Real_ptr v1, Real_ptr u2, Real_ptr v2, Index_type n) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < n && j < n) { POLYBENCH_GEMVER_BODY1; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < n && j < n) { body(i, j); } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_2(Real_ptr A, Real_ptr x, Real_ptr y, Real_type beta, Index_type n) { - Index_type i = blockIdx.x * 
blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; ++j) { @@ -101,21 +107,25 @@ __global__ void poly_gemmver_2(Real_ptr A, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY5; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_4(Real_ptr A, Real_ptr x, Real_ptr w, Real_type alpha, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; ++j) { @@ -125,17 +135,19 @@ __global__ void poly_gemmver_4(Real_ptr A, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_gemmver_234_lam(Index_type n, Lambda body) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { body(i); } } -void POLYBENCH_GEMVER::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -151,24 +163,24 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) GEMVER_THREADS_PER_BLOCK_HIP; GEMVER_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_gemmver_1), + hipLaunchKernelGGL((poly_gemmver_1), dim3(nblocks1), dim3(nthreads_per_block1), 0, 0, A, u1, v1, u2, v2, n); hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n, block_size); - hipLaunchKernelGGL((poly_gemmver_2), + hipLaunchKernelGGL((poly_gemmver_2), dim3(grid_size), dim3(block_size), 0, 0, A, x, y, beta, n); hipErrchk( hipGetLastError() ); - 
hipLaunchKernelGGL((poly_gemmver_3), + hipLaunchKernelGGL((poly_gemmver_3), dim3(grid_size), dim3(block_size), 0, 0, x, z, n); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_gemmver_4), + hipLaunchKernelGGL((poly_gemmver_4), dim3(grid_size), dim3(block_size), 0, 0, A, x, w, alpha, n); hipErrchk( hipGetLastError() ); @@ -192,9 +204,9 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) POLYBENCH_GEMVER_BODY1; }; - hipLaunchKernelGGL(poly_gemmver_1_lam, + hipLaunchKernelGGL((poly_gemmver_1_lam), dim3(nblocks1), dim3(nthreads_per_block1), 0, 0, - n, poly_gemmver_1_lambda); + n, poly_gemmver_1_lambda); hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); @@ -207,7 +219,7 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) POLYBENCH_GEMVER_BODY4; }; - hipLaunchKernelGGL(poly_gemmver_234_lam, + hipLaunchKernelGGL((poly_gemmver_234_lam), dim3(grid_size), dim3(block_size), 0, 0, n, poly_gemmver_2_lambda); hipErrchk( hipGetLastError() ); @@ -216,7 +228,7 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) POLYBENCH_GEMVER_BODY5; }; - hipLaunchKernelGGL(poly_gemmver_234_lam, + hipLaunchKernelGGL((poly_gemmver_234_lam), dim3(grid_size), dim3(block_size), 0, 0, n, poly_gemmver_3_lambda); hipErrchk( hipGetLastError() ); @@ -229,7 +241,7 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) POLYBENCH_GEMVER_BODY8; }; - hipLaunchKernelGGL(poly_gemmver_234_lam, + hipLaunchKernelGGL((poly_gemmver_234_lam), dim3(grid_size), dim3(block_size), 0, 0, n, poly_gemmver_4_lambda); hipErrchk( hipGetLastError() ); @@ -260,7 +272,7 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) > > > - >; + >; using EXEC_POL24 = RAJA::KernelPolicy< @@ -334,11 +346,12 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) POLYBENCH_GEMVER_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_GEMVER : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown Hip variant id = " << vid << std::endl; } - } 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GEMVER, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp index ba3ad5457..18013e3f7 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -14,13 +14,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid) +void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -49,14 +49,14 @@ void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid) POLYBENCH_GEMVER_BODY3; } POLYBENCH_GEMVER_BODY4; - } + } - #pragma omp parallel for + #pragma omp parallel for for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY5; } - #pragma omp parallel for + #pragma omp parallel for for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; j++) { @@ -215,7 +215,7 @@ void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid) poly_gemver_lam7, poly_gemver_lam8 - ); + ); } stopTimer(); @@ -224,12 +224,12 @@ void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_GEMVER : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp index 7b13712f1..c031bdf04 100644 --- 
a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -38,7 +38,7 @@ namespace polybench allocAndInitOpenMPDeviceData(w, m_w, m_n, did, hid); \ allocAndInitOpenMPDeviceData(x, m_x, m_n, did, hid); \ allocAndInitOpenMPDeviceData(y, m_y, m_n, did, hid); \ - allocAndInitOpenMPDeviceData(z, m_z, m_n, did, hid); + allocAndInitOpenMPDeviceData(z, m_z, m_n, did, hid); #define POLYBENCH_GEMVER_DATA_TEARDOWN_OMP_TARGET \ getOpenMPDeviceData(m_w, w, m_n, hid, did); \ @@ -50,11 +50,11 @@ namespace polybench deallocOpenMPDeviceData(w, did); \ deallocOpenMPDeviceData(x, did); \ deallocOpenMPDeviceData(y, did); \ - deallocOpenMPDeviceData(z, did); + deallocOpenMPDeviceData(z, did); - -void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) + +void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -77,7 +77,7 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(A,x,y) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = 0; i < n; i++) { + for (Index_type i = 0; i < n; i++) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMVER_BODY3; @@ -86,7 +86,7 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) } #pragma omp target is_device_ptr(x,z) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i 
= 0; i < n; i++) { POLYBENCH_GEMVER_BODY5; } @@ -102,7 +102,7 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) } } // end run_reps - stopTimer(); + stopTimer(); POLYBENCH_GEMVER_DATA_TEARDOWN_OMP_TARGET; @@ -187,7 +187,7 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) POLYBENCH_GEMVER_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_GEMVER : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp index 9498d1355..eeee6f0ec 100644 --- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -14,13 +14,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) +void POLYBENCH_GEMVER::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -39,7 +39,7 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) } } - for (Index_type i = 0; i < n; i++ ) { + for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMVER_BODY3; @@ -47,11 +47,11 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) POLYBENCH_GEMVER_BODY4; } - for (Index_type i = 0; i < n; i++ ) { + for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY5; } - for (Index_type i = 0; i < n; i++ ) { + for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMVER_BODY7; @@ -71,7 +71,7 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) auto poly_gemver_base_lam1 = [=](Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1; }; - auto poly_gemver_base_lam3 = [=](Index_type i, Index_type j, + auto poly_gemver_base_lam3 = [=](Index_type i, Index_type j, Real_type &dot) { POLYBENCH_GEMVER_BODY3; }; @@ -81,7 +81,7 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) auto poly_gemver_base_lam5 = [=](Index_type i) { POLYBENCH_GEMVER_BODY5; }; - auto poly_gemver_base_lam7 = [=](Index_type i, Index_type j, + auto poly_gemver_base_lam7 = [=](Index_type i, Index_type j, Real_type &dot) { POLYBENCH_GEMVER_BODY7; }; @@ -182,8 +182,8 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) RAJA::RangeSegment{0, n}), poly_gemver_lam1 ); - - RAJA::kernel_param( + + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}), RAJA::tuple{0.0}, @@ -192,12 +192,12 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) poly_gemver_lam3, poly_gemver_lam4 ); - - RAJA::forall (RAJA::RangeSegment{0, n}, + + RAJA::forall (RAJA::RangeSegment{0, n}, 
poly_gemver_lam5 ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}), RAJA::tuple{0.0}, @@ -207,16 +207,16 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) poly_gemver_lam8 ); - + } stopTimer(); - + break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_GEMVER : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index fce83907a..24a3f3d1b 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -85,7 +85,7 @@ POLYBENCH_GEMVER::~POLYBENCH_GEMVER() { } -void POLYBENCH_GEMVER::setUp(VariantID vid) +void POLYBENCH_GEMVER::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; @@ -100,12 +100,12 @@ void POLYBENCH_GEMVER::setUp(VariantID vid) allocAndInitData(m_z, m_n, vid); } -void POLYBENCH_GEMVER::updateChecksum(VariantID vid) +void POLYBENCH_GEMVER::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_w, m_n, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_w, m_n, checksum_scale_factor ); } -void POLYBENCH_GEMVER::tearDown(VariantID vid) +void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 919f18e5c..80c96fa94 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -15,8 +15,8 @@ /// } /// } /// -/// Note: this part of the kernel is modified to avoid -/// excessively large checksums +/// Note: this part of the kernel is modified to avoid +/// excessively large checksums /// for (Index_type i = 0; i < N; i++) { /// Real_type dot = 0.0; /// for (Index_type j = 0; j < N; j++) { @@ -143,17 +143,28 @@ class POLYBENCH_GEMVER : public KernelBase ~POLYBENCH_GEMVER(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Index_type m_n; Real_type m_alpha; Real_type m_beta; diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 3fdac4fd8..535e24efa 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore 
National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - #define POLYBENCH_GESUMMV_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, N); \ allocAndInitCudaDeviceData(y, m_y, N); \ @@ -41,12 +36,14 @@ namespace polybench deallocCudaDeviceData(B); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gesummv(Real_ptr x, Real_ptr y, Real_ptr A, Real_ptr B, Real_type alpha, Real_type beta, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_GESUMMV_BODY1; @@ -58,7 +55,8 @@ __global__ void poly_gesummv(Real_ptr x, Real_ptr y, } -void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -73,7 +71,7 @@ void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_gesummv<<>>(x, y, + poly_gesummv<<>>(x, y, A, B, alpha, beta, N); @@ -135,11 +133,12 @@ void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid) POLYBENCH_GESUMMV_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_GESUMMV : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GESUMMV, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 1fec5379b..ee39f9c6e 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - #define POLYBENCH_GESUMMV_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, N); \ allocAndInitHipDeviceData(y, m_y, N); \ @@ -41,12 +36,14 @@ namespace polybench deallocHipDeviceData(B); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gesummv(Real_ptr x, Real_ptr y, Real_ptr A, Real_ptr B, Real_type alpha, Real_type beta, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_GESUMMV_BODY1; @@ -58,7 +55,8 @@ __global__ void poly_gesummv(Real_ptr x, Real_ptr y, } -void POLYBENCH_GESUMMV::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -73,7 +71,7 @@ void POLYBENCH_GESUMMV::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_gesummv), + hipLaunchKernelGGL((poly_gesummv), dim3(grid_size), dim3(block_size),0,0, x, y, A, B, @@ -137,11 +135,12 @@ void POLYBENCH_GESUMMV::runHipVariant(VariantID vid) POLYBENCH_GESUMMV_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_GESUMMV : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GESUMMV, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git 
a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp index 8e46f7691..830bb73bf 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - -void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid) + +void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -111,7 +111,7 @@ void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid) RAJA::kernel_param( RAJA::make_tuple( RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N} ), - RAJA::make_tuple(static_cast(0.0), + RAJA::make_tuple(static_cast(0.0), static_cast(0.0)), poly_gesummv_lam1, @@ -126,12 +126,12 @@ void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_GESUMMV : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp index 5fd39ea33..c4c535bf6 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp @@ -1,10 +1,10 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_GESUMMV.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -118,7 +118,7 @@ void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid) POLYBENCH_GESUMMV_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_GESUMMV : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -127,4 +127,4 @@ void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp index c769da219..c65897e5d 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,12 +13,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) +void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); @@ -31,7 +31,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_GESUMMV_BODY1; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_GESUMMV_BODY2; @@ -49,7 +49,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_gesummv_base_lam2 = [=](Index_type i, Index_type j, + auto poly_gesummv_base_lam2 = [=](Index_type i, Index_type j, Real_type& tmpdot, Real_type& ydot) { POLYBENCH_GESUMMV_BODY2; }; @@ -82,7 +82,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) auto poly_gesummv_lam1 = [=](Real_type& tmpdot, Real_type& ydot) { POLYBENCH_GESUMMV_BODY1_RAJA; }; - auto poly_gesummv_lam2 = [=](Index_type i, Index_type j, + auto poly_gesummv_lam2 = [=](Index_type i, Index_type j, Real_type& tmpdot, Real_type& ydot) { POLYBENCH_GESUMMV_BODY2_RAJA; }; @@ -108,7 +108,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) RAJA::kernel_param( RAJA::make_tuple( RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N} ), - RAJA::make_tuple(static_cast(0.0), + RAJA::make_tuple(static_cast(0.0), static_cast(0.0)), poly_gesummv_lam1, @@ -124,7 +124,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_GESUMMV : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 39cb94510..eb527af27 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ 
b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -24,7 +24,7 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); - setDefaultReps(120); + setDefaultReps(120); m_N = std::sqrt( getTargetProblemSize() ) + 1; @@ -65,7 +65,7 @@ POLYBENCH_GESUMMV::~POLYBENCH_GESUMMV() { } -void POLYBENCH_GESUMMV::setUp(VariantID vid) +void POLYBENCH_GESUMMV::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_x, m_N, vid); @@ -74,12 +74,12 @@ void POLYBENCH_GESUMMV::setUp(VariantID vid) allocAndInitData(m_B, m_N * m_N, vid); } -void POLYBENCH_GESUMMV::updateChecksum(VariantID vid) +void POLYBENCH_GESUMMV::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_y, m_N); + checksum[vid][tune_idx] += calcChecksum(m_y, m_N); } -void POLYBENCH_GESUMMV::tearDown(VariantID vid) +void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index c8cc9e191..c8f71ee84 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -89,17 +89,27 @@ class POLYBENCH_GESUMMV : public KernelBase ~POLYBENCH_GESUMMV(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Index_type m_N; Real_type m_alpha; diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index a757a5e0e..ce6e7769e 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,14 +22,17 @@ namespace polybench { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - constexpr size_t i_block_sz = 1; - constexpr size_t j_block_sz = 8; - constexpr size_t k_block_sz = 32; +#define k_block_sz (32) +#define j_block_sz (block_size / k_block_sz) +#define i_block_sz (1) + +#define HEAT_3D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + k_block_sz, j_block_sz, i_block_sz #define HEAT_3D_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(k_block_sz, j_block_sz, i_block_sz); + dim3 nthreads_per_block(HEAT_3D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); #define HEAT_3D_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N-2, k_block_sz)), \ @@ -39,7 +42,8 @@ namespace polybench #define POLYBENCH_HEAT_3D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(A, m_Ainit, m_N*m_N*m_N); \ - allocAndInitCudaDeviceData(B, m_Binit, m_N*m_N*m_N); + allocAndInitCudaDeviceData(B, m_Binit, m_N*m_N*m_N); \ + static_assert(k_block_sz*j_block_sz*i_block_sz == block_size, "Invalid block_size"); #define POLYBENCH_HEAT_3D_TEARDOWN_CUDA \ @@ -49,34 +53,39 @@ namespace polybench deallocCudaDeviceData(B); +template < size_t k_block_size, size_t j_block_size, size_t i_block_size > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_1(Real_ptr A, Real_ptr B, Index_type N) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { POLYBENCH_HEAT_3D_BODY1; } } +template < size_t k_block_size, size_t j_block_size, size_t i_block_size > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_2(Real_ptr A, Real_ptr B, Index_type N) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * 
blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { POLYBENCH_HEAT_3D_BODY2; } } -template< typename Lambda > +template< size_t k_block_size, size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_lam(Index_type N, Lambda body) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { body(i, j, k); @@ -84,7 +93,8 @@ __global__ void poly_heat_3D_lam(Index_type N, Lambda body) } -void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -102,10 +112,12 @@ void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) HEAT_3D_THREADS_PER_BLOCK_CUDA; HEAT_3D_NBLOCKS_CUDA; - poly_heat_3D_1<<>>(A, B, N); + poly_heat_3D_1 + <<>>(A, B, N); cudaErrchk( cudaGetLastError() ); - poly_heat_3D_2<<>>(A, B, N); + poly_heat_3D_2 + <<>>(A, B, N); cudaErrchk( cudaGetLastError() ); } @@ -127,14 +139,16 @@ void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) HEAT_3D_THREADS_PER_BLOCK_CUDA; HEAT_3D_NBLOCKS_CUDA; - poly_heat_3D_lam<<>>(N, + poly_heat_3D_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1; } ); cudaErrchk( cudaGetLastError() ); - poly_heat_3D_lam<<>>(N, + poly_heat_3D_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2; } @@ -159,11 +173,11 @@ void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) 
RAJA::statement::CudaKernelFixedAsync, RAJA::cuda_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::statement::Tile<2, RAJA::tile_fixed, RAJA::cuda_block_x_direct, - RAJA::statement::For<2, RAJA::cuda_block_z_direct, // i + RAJA::statement::For<0, RAJA::cuda_block_z_direct, // i RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // k + RAJA::statement::For<2, RAJA::cuda_thread_x_direct, // k RAJA::statement::Lambda<0> > > @@ -203,11 +217,12 @@ void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) POLYBENCH_HEAT_3D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_HEAT_3D, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 545b70368..00e68aebd 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -22,14 +22,17 @@ namespace polybench { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - constexpr size_t i_block_sz = 1; - constexpr size_t j_block_sz = 8; - constexpr size_t k_block_sz = 32; +#define k_block_sz (32) +#define j_block_sz (block_size / k_block_sz) +#define i_block_sz (1) + +#define HEAT_3D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + k_block_sz, j_block_sz, i_block_sz #define HEAT_3D_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(k_block_sz, j_block_sz, i_block_sz); + dim3 nthreads_per_block(HEAT_3D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); #define HEAT_3D_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N-2, k_block_sz)), \ @@ -39,7 +42,8 @@ namespace polybench #define POLYBENCH_HEAT_3D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(A, m_Ainit, m_N*m_N*m_N); \ - allocAndInitHipDeviceData(B, m_Binit, m_N*m_N*m_N); + allocAndInitHipDeviceData(B, m_Binit, m_N*m_N*m_N); \ + static_assert(k_block_sz*j_block_sz*i_block_sz == block_size, "Invalid block_size"); #define POLYBENCH_HEAT_3D_TEARDOWN_HIP \ @@ -49,34 +53,39 @@ namespace polybench deallocHipDeviceData(B); +template < size_t k_block_size, size_t j_block_size, size_t i_block_size > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_1(Real_ptr A, Real_ptr B, Index_type N) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { POLYBENCH_HEAT_3D_BODY1; } } +template < size_t k_block_size, size_t j_block_size, size_t i_block_size > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_2(Real_ptr A, Real_ptr B, Index_type N) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + 
threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { POLYBENCH_HEAT_3D_BODY2; } } -template< typename Lambda > +template< size_t k_block_size, size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_lam(Index_type N, Lambda body) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { body(i, j, k); @@ -84,7 +93,8 @@ __global__ void poly_heat_3D_lam(Index_type N, Lambda body) } -void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -102,13 +112,13 @@ void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) HEAT_3D_THREADS_PER_BLOCK_HIP; HEAT_3D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_heat_3D_1), + hipLaunchKernelGGL((poly_heat_3D_1), dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_heat_3D_2), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((poly_heat_3D_2), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); @@ -140,12 +150,14 @@ void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) POLYBENCH_HEAT_3D_BODY2; }; - hipLaunchKernelGGL((poly_heat_3D_lam), + hipLaunchKernelGGL((poly_heat_3D_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, N, poly_heat_3D_1_lambda); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_heat_3D_lam), + hipLaunchKernelGGL((poly_heat_3D_lam), dim3(nblocks), 
dim3(nthreads_per_block), 0, 0, N, poly_heat_3D_2_lambda); hipErrchk( hipGetLastError() ); @@ -168,11 +180,11 @@ void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) RAJA::statement::HipKernelFixedAsync, RAJA::hip_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::statement::Tile<2, RAJA::tile_fixed, RAJA::hip_block_x_direct, - RAJA::statement::For<2, RAJA::hip_block_z_direct, // i + RAJA::statement::For<0, RAJA::hip_block_z_direct, // i RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j - RAJA::statement::For<0, RAJA::hip_thread_x_direct, // k + RAJA::statement::For<2, RAJA::hip_thread_x_direct, // k RAJA::statement::Lambda<0> > > @@ -211,11 +223,12 @@ void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) POLYBENCH_HEAT_3D_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_HEAT_3D, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp index 21d454911..50ca323de 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - -void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid) + +void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -149,17 +149,17 @@ void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid) stopTimer(); POLYBENCH_HEAT_3D_DATA_RESET; - + break; } default : { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index 765a931d2..692689d85 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -1,10 +1,10 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_HEAT_3D.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -36,7 +36,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -121,7 +121,7 @@ void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_HEAT_3D_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index ce2d76435..4afb06d21 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid) +void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); @@ -32,19 +32,19 @@ void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 0; t < tsteps; ++t) { + for (Index_type t = 0; t < tsteps; ++t) { - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { - for (Index_type k = 1; k < N-1; ++k ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type k = 1; k < N-1; ++k ) { POLYBENCH_HEAT_3D_BODY1; } } } - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { - for (Index_type k = 1; k < N-1; ++k ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type k = 1; k < N-1; ++k ) { POLYBENCH_HEAT_3D_BODY2; } } @@ -63,11 +63,11 @@ void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_heat3d_base_lam1 = [=](Index_type i, Index_type j, + auto poly_heat3d_base_lam1 = [=](Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1; }; - auto poly_heat3d_base_lam2 = [=](Index_type i, Index_type j, + auto poly_heat3d_base_lam2 = [=](Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2; }; @@ -155,7 +155,7 @@ void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 85fd0ce38..567192b9a 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ 
b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -36,7 +36,7 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep( m_tsteps * 2 ); setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) * (m_N-2) + + (m_N-2) * (m_N-2) * (m_N-2) + (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_N * m_N * m_N - 12*(m_N-2) - 8) + (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * @@ -76,7 +76,7 @@ POLYBENCH_HEAT_3D::~POLYBENCH_HEAT_3D() { } -void POLYBENCH_HEAT_3D::setUp(VariantID vid) +void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_Ainit, m_N*m_N*m_N, vid); @@ -85,13 +85,13 @@ void POLYBENCH_HEAT_3D::setUp(VariantID vid) allocAndInitDataConst(m_B, m_N*m_N*m_N, 0.0, vid); } -void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid) +void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_HEAT_3D::tearDown(VariantID vid) +void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index b21b56576..81ab06e0e 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ 
b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -115,17 +115,28 @@ class POLYBENCH_HEAT_3D : public KernelBase ~POLYBENCH_HEAT_3D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index 35f104444..a48e70a84 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC 
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - #define POLYBENCH_JACOBI_1D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(A, m_Ainit, m_N); \ allocAndInitCudaDeviceData(B, m_Binit, m_N); @@ -38,18 +33,22 @@ namespace polybench deallocCudaDeviceData(B); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_jacobi_1D_1(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N-1) { POLYBENCH_JACOBI_1D_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_jacobi_1D_2(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N-1) { POLYBENCH_JACOBI_1D_BODY2; @@ -57,7 +56,8 @@ __global__ void poly_jacobi_1D_2(Real_ptr A, Real_ptr B, Index_type N) } -void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -74,10 +74,10 @@ void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_jacobi_1D_1<<>>(A, B, N); + poly_jacobi_1D_1<<>>(A, B, N); cudaErrchk( cudaGetLastError() ); - poly_jacobi_1D_2<<>>(A, B, N); + poly_jacobi_1D_2<<>>(A, B, N); cudaErrchk( cudaGetLastError() ); } @@ -116,11 +116,12 @@ void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid) POLYBENCH_JACOBI_1D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n 
POLYBENCH_JACOBI_1D : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_JACOBI_1D, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index d566cd430..a5ff60dfc 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - #define POLYBENCH_JACOBI_1D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(A, m_Ainit, m_N); \ allocAndInitHipDeviceData(B, m_Binit, m_N); @@ -38,18 +33,22 @@ namespace polybench deallocHipDeviceData(B); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_jacobi_1D_1(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N-1) { POLYBENCH_JACOBI_1D_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_jacobi_1D_2(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N-1) { POLYBENCH_JACOBI_1D_BODY2; @@ -57,7 +56,8 @@ __global__ void poly_jacobi_1D_2(Real_ptr A, Real_ptr B, Index_type N) } -void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) { const Index_type run_reps 
= getRunReps(); @@ -74,11 +74,11 @@ void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_jacobi_1D_1), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((poly_jacobi_1D_1), dim3(grid_size), dim3(block_size), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_jacobi_1D_2), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((poly_jacobi_1D_2), dim3(grid_size), dim3(block_size), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); @@ -118,11 +118,12 @@ void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid) POLYBENCH_JACOBI_1D_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_JACOBI_1D, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp index d709d7d04..d813f9d17 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - -void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid) + +void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -114,12 +114,12 @@ void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index 774289a68..1ca122ebb 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -1,10 +1,10 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_JACOBI_1D.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -41,7 +41,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -55,7 +55,7 @@ void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type t = 0; t < tsteps; ++t) { - + #pragma omp target is_device_ptr(A,B) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = 1; i < N-1; ++i ) { @@ -101,7 +101,7 @@ void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_JACOBI_1D_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -110,4 +110,4 @@ void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index 3592b3daf..f23ccdf06 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - -void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid) + +void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); @@ -39,12 +39,12 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 0; t < tsteps; ++t) { + for (Index_type t = 0; t < tsteps; ++t) { - for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type i = 1; i < N-1; ++i ) { POLYBENCH_JACOBI_1D_BODY1; } - for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type i = 1; i < N-1; ++i ) { POLYBENCH_JACOBI_1D_BODY2; } @@ -95,7 +95,7 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid) poly_jacobi1d_lam1 ); - RAJA::forall ( RAJA::RangeSegment{1, N-1}, + RAJA::forall ( RAJA::RangeSegment{1, N-1}, poly_jacobi1d_lam2 ); @@ -111,7 +111,7 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 48c064780..f86bb5956 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -25,8 +25,8 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setDefaultProblemSize( N_default-2 ); setDefaultReps(100); - - m_N = getTargetProblemSize(); + + m_N = getTargetProblemSize(); m_tsteps = 16; @@ -34,13 +34,13 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep(m_tsteps * 2); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * + setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * + (m_N-2) + + (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * + (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * + (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N ) ); setFLOPsPerRep( m_tsteps * ( 3 * (m_N-2) + 3 * (m_N-2) ) ); @@ -73,7 +73,7 @@ POLYBENCH_JACOBI_1D::~POLYBENCH_JACOBI_1D() { } -void POLYBENCH_JACOBI_1D::setUp(VariantID vid) +void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_Ainit, m_N, vid); @@ -82,13 +82,13 @@ void POLYBENCH_JACOBI_1D::setUp(VariantID vid) allocAndInitDataConst(m_B, m_N, 0.0, vid); } -void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid) +void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_A, m_N, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_B, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N, checksum_scale_factor ); } -void POLYBENCH_JACOBI_1D::tearDown(VariantID vid) +void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp 
b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 290e26ce0..cb3131490 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -61,17 +61,27 @@ class POLYBENCH_JACOBI_1D : public KernelBase ~POLYBENCH_JACOBI_1D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index a32a9cce6..ca6a485ec 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,13 +22,16 @@ namespace polybench { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - constexpr size_t i_block_sz = 8; - constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) + +#define JACOBI_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz #define JACOBI_2D_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(JACOBI_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define JACOBI_2D_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N-2, j_block_sz)), \ @@ -48,31 +51,36 @@ namespace polybench deallocCudaDeviceData(B); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_1(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { POLYBENCH_JACOBI_2D_BODY1; } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_2(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { POLYBENCH_JACOBI_2D_BODY2; } } 
-template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_lam(Index_type N, Lambda body) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { body(i, j); @@ -80,7 +88,8 @@ __global__ void poly_jacobi_2D_lam(Index_type N, Lambda body) } -void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -98,10 +107,12 @@ void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) JACOBI_2D_THREADS_PER_BLOCK_CUDA; JACOBI_2D_NBLOCKS_CUDA; - poly_jacobi_2D_1<<>>(A, B, N); + poly_jacobi_2D_1 + <<>>(A, B, N); cudaErrchk( cudaGetLastError() ); - poly_jacobi_2D_2<<>>(A, B, N); + poly_jacobi_2D_2 + <<>>(A, B, N); cudaErrchk( cudaGetLastError() ); } @@ -123,14 +134,16 @@ void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) JACOBI_2D_THREADS_PER_BLOCK_CUDA; JACOBI_2D_NBLOCKS_CUDA; - poly_jacobi_2D_lam<<>>(N, + poly_jacobi_2D_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1; } ); cudaErrchk( cudaGetLastError() ); - poly_jacobi_2D_lam<<>>(N, + poly_jacobi_2D_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2; } @@ -165,7 +178,7 @@ void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) > > > - >; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -194,11 +207,12 @@ void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) POLYBENCH_JACOBI_2D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown Cuda variant id = " 
<< vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_JACOBI_2D, Cuda) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index dd7230205..bf03f9b86 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,13 +22,16 @@ namespace polybench { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - constexpr size_t i_block_sz = 8; - constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) + +#define JACOBI_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz #define JACOBI_2D_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(JACOBI_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define JACOBI_2D_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N-2, j_block_sz)), \ @@ -48,31 +51,36 @@ namespace polybench deallocHipDeviceData(B); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_1(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { POLYBENCH_JACOBI_2D_BODY1; } } +template < size_t j_block_size, size_t i_block_size > 
+__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_2(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { POLYBENCH_JACOBI_2D_BODY2; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_lam(Index_type N, Lambda body) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { body(i, j); @@ -80,7 +88,8 @@ __global__ void poly_jacobi_2D_lam(Index_type N, Lambda body) } -void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -98,12 +107,12 @@ void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) JACOBI_2D_THREADS_PER_BLOCK_HIP; JACOBI_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_jacobi_2D_1), + hipLaunchKernelGGL((poly_jacobi_2D_1), dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_jacobi_2D_2), + hipLaunchKernelGGL((poly_jacobi_2D_2), dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); @@ -127,22 +136,22 @@ void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) JACOBI_2D_THREADS_PER_BLOCK_HIP; JACOBI_2D_NBLOCKS_HIP; - auto poly_jacobi_2D_1_lambda = + auto poly_jacobi_2D_1_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1; }; - hipLaunchKernelGGL((poly_jacobi_2D_lam), + 
hipLaunchKernelGGL((poly_jacobi_2D_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, N, poly_jacobi_2D_1_lambda); hipErrchk( hipGetLastError() ); - auto poly_jacobi_2D_2_lambda = + auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2; }; - hipLaunchKernelGGL((poly_jacobi_2D_lam), + hipLaunchKernelGGL((poly_jacobi_2D_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, N, poly_jacobi_2D_2_lambda); hipErrchk( hipGetLastError() ); @@ -204,11 +213,12 @@ void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) POLYBENCH_JACOBI_2D_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_JACOBI_2D, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index e8a7b80ea..4acf70b25 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid) +void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -37,19 +37,19 @@ void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { #pragma omp parallel for - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY1; } } #pragma omp parallel for - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY2; } } - + } } @@ -59,7 +59,7 @@ void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid) break; } - + case Lambda_OpenMP : { auto poly_jacobi2d_base_lam1 = [=](Index_type i, Index_type j) { @@ -146,12 +146,12 @@ void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp index 91d5122cc..9538d50f7 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp @@ -1,10 +1,10 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_JACOBI_2D.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -36,7 +36,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -58,14 +58,14 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_JACOBI_2D_BODY1; } } - + #pragma omp target is_device_ptr(A,B) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type i = 1; i < N-1; ++i ) { for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY2; } - } + } } @@ -83,11 +83,11 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::Collapse, + RAJA::ArgList<0, 1>, RAJA::statement::Lambda<0> >, RAJA::statement::Collapse, + RAJA::ArgList<0, 1>, RAJA::statement::Lambda<1> > >; @@ -115,7 +115,7 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_JACOBI_2D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -124,4 +124,4 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp index aef47da43..856404f92 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid) +void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); @@ -32,15 +32,15 @@ void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 0; t < tsteps; ++t) { + for (Index_type t = 0; t < tsteps; ++t) { - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY1; } } - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY2; } } @@ -142,7 +142,7 @@ void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 9e204bdab..1b4f9378a 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite 
project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -75,7 +75,7 @@ POLYBENCH_JACOBI_2D::~POLYBENCH_JACOBI_2D() { } -void POLYBENCH_JACOBI_2D::setUp(VariantID vid) +void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_Ainit, m_N*m_N, vid); @@ -84,13 +84,13 @@ void POLYBENCH_JACOBI_2D::setUp(VariantID vid) allocAndInitDataConst(m_B, m_N*m_N, 0.0, vid); } -void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid) +void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_JACOBI_2D::tearDown(VariantID vid) +void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 9a57325a1..a2ba63181 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -81,17 +81,28 @@ class POLYBENCH_JACOBI_2D : public KernelBase ~POLYBENCH_JACOBI_2D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type>; + Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 2a59f018f..2795cadbb 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - #define POLYBENCH_MVT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x1, m_x1, N); \ allocAndInitCudaDeviceData(x2, m_x2, N); \ @@ -44,10 +39,12 @@ namespace polybench deallocCudaDeviceData(A); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_MVT_BODY1; @@ -58,10 +55,12 @@ __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_mvt_2(Real_ptr A, Real_ptr x2, Real_ptr y2, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_MVT_BODY4; @@ -73,7 +72,8 @@ __global__ void poly_mvt_2(Real_ptr A, Real_ptr x2, Real_ptr y2, } -void POLYBENCH_MVT::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -88,10 +88,10 @@ void POLYBENCH_MVT::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_mvt_1<<>>(A, x1, y1, N); + poly_mvt_1<<>>(A, x1, y1, N); cudaErrchk( cudaGetLastError() ); - poly_mvt_2<<>>(A, x2, y2, N); + poly_mvt_2<<>>(A, x2, y2, N); cudaErrchk( cudaGetLastError() ); } @@ -174,11 +174,12 @@ void POLYBENCH_MVT::runCudaVariant(VariantID vid) POLYBENCH_MVT_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_MVT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_MVT : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_MVT, Cuda) + } // end namespace 
polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 00619eee5..176c41710 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for Hip execution - // - const size_t block_size = 256; - #define POLYBENCH_MVT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x1, m_x1, N); \ allocAndInitHipDeviceData(x2, m_x2, N); \ @@ -44,10 +39,12 @@ namespace polybench deallocHipDeviceData(A); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_MVT_BODY1; @@ -58,10 +55,12 @@ __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_mvt_2(Real_ptr A, Real_ptr x2, Real_ptr y2, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_MVT_BODY4; @@ -73,7 +72,8 @@ __global__ void poly_mvt_2(Real_ptr A, Real_ptr x2, Real_ptr y2, } -void POLYBENCH_MVT::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -88,12 +88,12 @@ void POLYBENCH_MVT::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - 
hipLaunchKernelGGL((poly_mvt_1), + hipLaunchKernelGGL((poly_mvt_1), dim3(grid_size), dim3(block_size), 0, 0, A, x1, y1, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_mvt_2), + hipLaunchKernelGGL((poly_mvt_2), dim3(grid_size), dim3(block_size), 0, 0, A, x2, y2, N); hipErrchk( hipGetLastError() ); @@ -172,11 +172,12 @@ void POLYBENCH_MVT::runHipVariant(VariantID vid) POLYBENCH_MVT_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_MVT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_MVT : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_MVT, Hip) + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_MVT-OMP.cpp b/src/polybench/POLYBENCH_MVT-OMP.cpp index 10b920848..f5dad16b9 100644 --- a/src/polybench/POLYBENCH_MVT-OMP.cpp +++ b/src/polybench/POLYBENCH_MVT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - -void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) + +void POLYBENCH_MVT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -38,7 +38,7 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) { #pragma omp for schedule(static) nowait - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_MVT_BODY1; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_MVT_BODY2; @@ -47,7 +47,7 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) } #pragma omp for schedule(static) nowait - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_MVT_BODY4; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_MVT_BODY5; @@ -140,7 +140,7 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::omp_for_nowait_static_exec< >, // i - RAJA::statement::Lambda<0, RAJA::Params<0>>, + RAJA::statement::Lambda<0, RAJA::Params<0>>, RAJA::statement::For<1, RAJA::loop_exec, // j RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> >, @@ -157,22 +157,22 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, - + poly_mvt_lam1, poly_mvt_lam2, poly_mvt_lam3 - + ); RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, - + poly_mvt_lam4, - poly_mvt_lam5, + poly_mvt_lam5, poly_mvt_lam6 - + ); }); // end omp parallel region @@ -184,12 +184,12 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_MVT : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_MVT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git 
a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index 4fe035e01..acd7ad56a 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -1,10 +1,10 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_MVT.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -47,7 +47,7 @@ namespace polybench deallocOpenMPDeviceData(A, did); -void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); @@ -149,7 +149,7 @@ void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid) POLYBENCH_MVT_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_MVT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_MVT : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -158,4 +158,4 @@ void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_MVT-Seq.cpp b/src/polybench/POLYBENCH_MVT-Seq.cpp index fd8be8659..8d115b94e 100644 --- a/src/polybench/POLYBENCH_MVT-Seq.cpp +++ b/src/polybench/POLYBENCH_MVT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, 
Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -13,13 +13,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { -void POLYBENCH_MVT::runSeqVariant(VariantID vid) +void POLYBENCH_MVT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); @@ -32,7 +32,7 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_MVT_BODY1; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_MVT_BODY2; @@ -40,7 +40,7 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) POLYBENCH_MVT_BODY3; } - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_MVT_BODY4; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_MVT_BODY5; @@ -58,19 +58,19 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_mvt_base_lam2 = [=] (Index_type i, Index_type j, + auto poly_mvt_base_lam2 = [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_MVT_BODY2; }; - auto poly_mvt_base_lam3 = [=] (Index_type i, + auto poly_mvt_base_lam3 = [=] (Index_type i, Real_type &dot) { POLYBENCH_MVT_BODY3; }; - auto poly_mvt_base_lam5 = [=] (Index_type i, Index_type j, + auto poly_mvt_base_lam5 = [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_MVT_BODY5; }; - auto poly_mvt_base_lam6 = [=] (Index_type i, + auto poly_mvt_base_lam6 = [=] (Index_type i, Real_type &dot) { POLYBENCH_MVT_BODY6; }; @@ -125,9 +125,9 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::For<0, RAJA::loop_exec, // i RAJA::statement::Lambda<0, RAJA::Params<0>>, - RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<1, 
RAJA::loop_exec, // j RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> >, RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> @@ -139,26 +139,26 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) RAJA::region( [=]() { - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, - + poly_mvt_lam1, poly_mvt_lam2, poly_mvt_lam3 - + ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, - + poly_mvt_lam4, - poly_mvt_lam5, + poly_mvt_lam5, poly_mvt_lam6 - + ); }); // end sequential region (for single-source code) @@ -171,7 +171,7 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_MVT : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_MVT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index ae2749ce5..3354ca97d 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -68,7 +68,7 @@ POLYBENCH_MVT::~POLYBENCH_MVT() { } -void POLYBENCH_MVT::setUp(VariantID vid) +void POLYBENCH_MVT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_y1, m_N, vid); @@ -78,13 +78,13 @@ void POLYBENCH_MVT::setUp(VariantID vid) allocAndInitDataConst(m_x2, m_N, 0.0, vid); } -void POLYBENCH_MVT::updateChecksum(VariantID vid) +void POLYBENCH_MVT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_x1, m_N, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_x2, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x1, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x2, m_N, checksum_scale_factor ); } -void POLYBENCH_MVT::tearDown(VariantID vid) +void POLYBENCH_MVT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x1); diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index cb72784ed..dce40baf2 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -103,17 +103,27 @@ class POLYBENCH_MVT : public KernelBase ~POLYBENCH_MVT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Index_type m_N; Real_ptr m_x1; Real_ptr m_x2; diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index c34f9120c..43ae990a9 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -9,7 +9,7 @@ */ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -21,13 +21,17 @@ #define RAJAPerf_config_HPP #include "RAJA/config.hpp" +#include "camp/number.hpp" #include +#cmakedefine RAJA_PERFSUITE_ENABLE_MPI +#cmakedefine RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN + namespace rajaperf { struct configuration { - +#if 0 // Version of RAJA Perf Suite (ex: 0.1.0) static const std::string perfsuite_version = "@RAJA_PERFSUITE_VERSION_MAJOR@" + std::string(".") + @@ -35,7 +39,7 @@ static const std::string perfsuite_version = "@RAJA_PERFSUITE_VERSION_PATCHLEVEL@"; // Version of RAJA used to build (ex: 0.2.4) -static const std::string raja_version = +static const std::string raja_version = std::to_string(RAJA::RAJA_VERSION_MAJOR) + std::string(".") + std::to_string(RAJA::RAJA_VERSION_MINOR) + std::string(".") + std::to_string(RAJA::RAJA_VERSION_PATCH_LEVEL); @@ -43,13 +47,20 @@ std::to_string(RAJA::RAJA_VERSION_PATCH_LEVEL); // Systype and machine code was built on (ex: chaos_5_x64_64, rzhasgpu18) static const std::string systype_build = "@RAJAPERF_BUILD_SYSTYPE@"; static const std::string machine_build = "@RAJAPERF_BUILD_HOST@"; - + // Compiler used to build (ex: gcc-4.9.3) static const std::string compiler = "@RAJAPERF_COMPILER@"; // Command options used to build (ex: -Ofast -mavx) static const std::string compiler_options = "@RAJAPERF_COMPILER_OPTIONS@"; - +#endif + +// helper alias to void trailing comma in no-arg case +template < size_t... Is > +using i_seq = camp::int_seq; +// List of GPU block sizes +using gpu_block_sizes = i_seq<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>; + // Name of user who ran code std::string user_run; @@ -59,9 +70,16 @@ std::string date_run; // Systype and machine code ran on (ex: chaos_5_x64_64) std::string systype_run; std::string machine_run; - + }; } // closing brace for rajaperf namespace +// Squash compiler warnings about unused variables +template < typename ... Ts > +inline void RAJAPERF_UNUSED_VAR(Ts&&...) { } + +// Squash compiler warnings about unused arguments +#define RAJAPERF_UNUSED_ARG(...) 
+ #endif // closing endif for header file include guard diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 4fc0a4f0e..102774a13 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define ADD_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); \ @@ -38,17 +32,20 @@ namespace stream deallocCudaDeviceData(b); \ deallocCudaDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ADD_BODY; } } -void ADD::runCudaVariant(VariantID vid) +template < size_t block_size > +void ADD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -64,7 +61,7 @@ void ADD::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - add<<>>( c, a, b, + add<<>>( c, a, b, iend ); cudaErrchk( cudaGetLastError() ); @@ -81,7 +78,7 @@ void ADD::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { ADD_BODY; }); @@ -110,10 +107,12 @@ void ADD::runCudaVariant(VariantID 
vid) ADD_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n ADD : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ADD, Cuda) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 68b671a63..5e53500c8 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define ADD_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(b, m_b, iend); \ @@ -38,17 +32,20 @@ namespace stream deallocHipDeviceData(b); \ deallocHipDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ADD_BODY; } } -void ADD::runHipVariant(VariantID vid) +template < size_t block_size > +void ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -64,7 +61,7 @@ void ADD::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((add), dim3(grid_size), dim3(block_size), 0, 0, c, a, b, + hipLaunchKernelGGL((add), dim3(grid_size), dim3(block_size), 0, 0, c, a, b, iend ); hipErrchk( hipGetLastError() ); 
@@ -85,7 +82,7 @@ void ADD::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, add_lambda); hipErrchk( hipGetLastError() ); @@ -112,10 +109,12 @@ void ADD::runHipVariant(VariantID vid) ADD_DATA_TEARDOWN_HIP; } else { - std::cout << "\n ADD : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ADD, Hip) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD-OMP.cpp b/src/stream/ADD-OMP.cpp index c73b5c5c9..ae425a93f 100644 --- a/src/stream/ADD-OMP.cpp +++ b/src/stream/ADD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,15 +12,15 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { - -void ADD::runOpenMPVariant(VariantID vid) + +void ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -81,12 +81,12 @@ void ADD::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n ADD : Unknown variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index d83bf1507..2089472fa 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -40,7 +40,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void ADD::runOpenMPTargetVariant(VariantID vid) +void ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -84,7 +84,7 @@ void ADD::runOpenMPTargetVariant(VariantID vid) ADD_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n ADD : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index 8b670fc0e..f421d44c2 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { - -void ADD::runSeqVariant(VariantID vid) + +void ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -61,7 +61,7 @@ void ADD::runSeqVariant(VariantID vid) stopTimer(); break; - } + } case RAJA_Seq : { @@ -79,7 +79,7 @@ void ADD::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n ADD : Unknown variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 200172e60..904c0804b 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -28,7 +28,7 @@ ADD::ADD(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(1 * getActualProblemSize()); @@ -58,19 +58,19 @@ ADD::~ADD() { } -void ADD::setUp(VariantID vid) +void ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); allocAndInitDataConst(m_c, getActualProblemSize(), 0.0, vid); } -void ADD::updateChecksum(VariantID vid) +void ADD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_c, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize()); } -void ADD::tearDown(VariantID vid) +void ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 0bf45b810..07d0dea79 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -43,17 +43,27 @@ class ADD : public KernelBase ~ADD(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_a; Real_ptr m_b; Real_ptr m_c; diff --git a/src/stream/CMakeLists.txt b/src/stream/CMakeLists.txt index e70ed685d..2122b7867 100644 --- a/src/stream/CMakeLists.txt +++ b/src/stream/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 62afb9ad8..cddf986ac 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define COPY_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(c, m_c, iend); @@ -36,17 +30,20 @@ namespace stream deallocCudaDeviceData(a); \ deallocCudaDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void copy(Real_ptr c, Real_ptr a, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { COPY_BODY; } } -void COPY::runCudaVariant(VariantID vid) +template < size_t block_size > +void COPY::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,7 +59,7 @@ void COPY::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - copy<<>>( c, a, + copy<<>>( c, a, iend ); cudaErrchk( cudaGetLastError() ); @@ -79,7 +76,7 @@ void COPY::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { COPY_BODY; }); @@ -108,11 +105,12 @@ void COPY::runCudaVariant(VariantID vid) COPY_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n COPY 
: Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(COPY, Cuda) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 124f880fc..fe302a7fc 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define COPY_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(c, m_c, iend); @@ -36,17 +30,20 @@ namespace stream deallocHipDeviceData(a); \ deallocHipDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void copy(Real_ptr c, Real_ptr a, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { COPY_BODY; } } -void COPY::runHipVariant(VariantID vid) +template < size_t block_size > +void COPY::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,7 +59,7 @@ void COPY::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((copy), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((copy), dim3(grid_size), dim3(block_size), 0, 0, c, a, iend ); hipErrchk( hipGetLastError() ); @@ -83,7 +80,7 @@ void COPY::runHipVariant(VariantID vid) }; const 
size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, copy_lambda); hipErrchk( hipGetLastError() ); @@ -110,11 +107,12 @@ void COPY::runHipVariant(VariantID vid) COPY_DATA_TEARDOWN_HIP; } else { - std::cout << "\n COPY : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(COPY, Hip) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY-OMP.cpp b/src/stream/COPY-OMP.cpp index 8c023ed3b..c1b38e25f 100644 --- a/src/stream/COPY-OMP.cpp +++ b/src/stream/COPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { -void COPY::runOpenMPVariant(VariantID vid) +void COPY::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void COPY::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n COPY : Unknown variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index 8ba9d7ef3..823a32b13 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -39,7 +39,7 @@ namespace stream deallocOpenMPDeviceData(c, did); -void COPY::runOpenMPTargetVariant(VariantID vid) +void COPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -83,7 +83,7 @@ void COPY::runOpenMPTargetVariant(VariantID vid) COPY_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n COPY : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp index 72bd9485d..a807c0bee 100644 --- a/src/stream/COPY-Seq.cpp +++ b/src/stream/COPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { -void COPY::runSeqVariant(VariantID vid) +void COPY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -79,7 +79,7 @@ void COPY::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n COPY : Unknown variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index d8c7ec1d6..251208a4d 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,7 @@ COPY::COPY(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * + setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(0); @@ -58,18 +58,18 @@ COPY::~COPY() { } -void COPY::setUp(VariantID vid) +void COPY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitDataConst(m_c, getActualProblemSize(), 0.0, vid); } -void COPY::updateChecksum(VariantID vid) +void COPY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_c, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize()); } -void COPY::tearDown(VariantID vid) +void COPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 
010a391c8..0f23bfa68 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -42,17 +42,27 @@ class COPY : public KernelBase ~COPY(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_a; Real_ptr m_c; }; diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index ebeb2ca3a..de23c290b 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC 
// and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,12 +22,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define DOT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); @@ -36,21 +30,23 @@ namespace stream deallocCudaDeviceData(a); \ deallocCudaDeviceData(b); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void dot(Real_ptr a, Real_ptr b, Real_ptr dprod, Real_type dprod_init, Index_type iend) { extern __shared__ Real_type pdot[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; pdot[ threadIdx.x ] = dprod_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { pdot[ threadIdx.x ] += a[ i ] * b[i]; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { pdot[ threadIdx.x ] += pdot[ threadIdx.x + i ]; } @@ -70,7 +66,8 @@ __global__ void dot(Real_ptr a, Real_ptr b, } -void DOT::runCudaVariant(VariantID vid) +template < size_t block_size > +void DOT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -91,10 +88,8 @@ void DOT::runCudaVariant(VariantID vid) initCudaDeviceData(dprod, &m_dot_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - dot<<>>( a, b, - dprod, m_dot_init, - iend ); + dot<<>>( + a, b, dprod, m_dot_init, iend ); cudaErrchk( cudaGetLastError() ); Real_type lprod; @@ -131,10 +126,12 @@ void DOT::runCudaVariant(VariantID vid) DOT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; } } 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DOT, Cuda) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 47d4ad9b5..3e75e64ef 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,12 +22,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define DOT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(b, m_b, iend); @@ -36,21 +30,23 @@ namespace stream deallocHipDeviceData(a); \ deallocHipDeviceData(b); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void dot(Real_ptr a, Real_ptr b, Real_ptr dprod, Real_type dprod_init, Index_type iend) { HIP_DYNAMIC_SHARED( Real_type, pdot) - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; pdot[ threadIdx.x ] = dprod_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { pdot[ threadIdx.x ] += a[ i ] * b[i]; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { pdot[ threadIdx.x ] += pdot[ threadIdx.x + i ]; } @@ -71,7 +67,8 @@ __global__ void dot(Real_ptr a, Real_ptr b, } -void DOT::runHipVariant(VariantID vid) +template < size_t block_size > +void DOT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -92,9 +89,9 @@ void DOT::runHipVariant(VariantID vid) initHipDeviceData(dprod, &m_dot_init, 1); const 
size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((dot), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, a, b, - dprod, m_dot_init, - iend ); + hipLaunchKernelGGL((dot), dim3(grid_size), dim3(block_size), + sizeof(Real_type)*block_size, 0, + a, b, dprod, m_dot_init, iend ); hipErrchk( hipGetLastError() ); Real_type lprod; @@ -131,10 +128,12 @@ void DOT::runHipVariant(VariantID vid) DOT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n DOT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DOT, Hip) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index 6d7c6b77d..efd8e9ffa 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { -void DOT::runOpenMPVariant(VariantID vid) +void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -95,12 +95,12 @@ void DOT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n DOT : Unknown variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index e74c50acc..27f96a5d1 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -37,7 +37,7 @@ namespace stream deallocOpenMPDeviceData(a, did); \ deallocOpenMPDeviceData(b, did); -void DOT::runOpenMPTargetVariant(VariantID vid) +void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -90,7 +90,7 @@ void DOT::runOpenMPTargetVariant(VariantID vid) DOT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n DOT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp index 5baf26592..cde2263de 100644 --- a/src/stream/DOT-Seq.cpp +++ b/src/stream/DOT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { -void DOT::runSeqVariant(VariantID vid) +void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -93,7 +93,7 @@ void DOT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n DOT : Unknown variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index cca4aae4a..0d9657a8a 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -29,7 +29,7 @@ DOT::DOT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 2*sizeof(Real_type)) * + (0*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(2 * getActualProblemSize()); @@ -58,7 +58,7 @@ DOT::~DOT() { } -void DOT::setUp(VariantID vid) +void DOT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -67,12 +67,12 @@ void DOT::setUp(VariantID vid) m_dot_init = 0.0; } -void DOT::updateChecksum(VariantID vid) +void DOT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += m_dot; + checksum[vid][tune_idx] += m_dot; } -void DOT::tearDown(VariantID vid) +void DOT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/stream/DOT.hpp 
b/src/stream/DOT.hpp index adb9309c4..64d70c630 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -42,17 +42,27 @@ class DOT : public KernelBase ~DOT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_a; Real_ptr m_b; Real_type m_dot; diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 01ccf4956..8db12d087 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, 
Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define MUL_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(b, m_b, iend); \ allocAndInitCudaDeviceData(c, m_c, iend); @@ -36,16 +30,20 @@ namespace stream deallocCudaDeviceData(b); \ deallocCudaDeviceData(c) +template < size_t block_size > +__launch_bounds__(block_size) __global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { MUL_BODY; } } -void MUL::runCudaVariant(VariantID vid) + +template < size_t block_size > +void MUL::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -61,7 +59,7 @@ void MUL::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - mul<<>>( b, c, alpha, + mul<<>>( b, c, alpha, iend ); cudaErrchk( cudaGetLastError() ); @@ -78,7 +76,7 @@ void MUL::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { MUL_BODY; }); @@ -107,10 +105,12 @@ void MUL::runCudaVariant(VariantID vid) MUL_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n MUL : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown Cuda variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MUL, Cuda) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index bdb5ca0eb..3e5e3f9f0 
100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define MUL_DATA_SETUP_HIP \ allocAndInitHipDeviceData(b, m_b, iend); \ allocAndInitHipDeviceData(c, m_c, iend); @@ -36,16 +30,20 @@ namespace stream deallocHipDeviceData(b); \ deallocHipDeviceData(c) +template < size_t block_size > +__launch_bounds__(block_size) __global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { MUL_BODY; } } -void MUL::runHipVariant(VariantID vid) + +template < size_t block_size > +void MUL::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -61,7 +59,7 @@ void MUL::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((mul), dim3(grid_size), dim3(block_size), 0, 0, b, c, alpha, + hipLaunchKernelGGL((mul), dim3(grid_size), dim3(block_size), 0, 0, b, c, alpha, iend ); hipErrchk( hipGetLastError() ); @@ -82,7 +80,7 @@ void MUL::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, mul_lambda); hipErrchk( hipGetLastError() ); @@ -109,10 +107,12 @@ void MUL::runHipVariant(VariantID vid) MUL_DATA_TEARDOWN_HIP; } 
else { - std::cout << "\n MUL : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown Hip variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MUL, Hip) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL-OMP.cpp b/src/stream/MUL-OMP.cpp index d5d552f8f..7b78bf819 100644 --- a/src/stream/MUL-OMP.cpp +++ b/src/stream/MUL-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { -void MUL::runOpenMPVariant(VariantID vid) +void MUL::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void MUL::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n MUL : Unknown variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp index 53d018d64..7e3141c78 100644 --- a/src/stream/MUL-OMPTarget.cpp +++ b/src/stream/MUL-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -38,7 +38,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void MUL::runOpenMPTargetVariant(VariantID vid) +void MUL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -82,7 +82,7 @@ void MUL::runOpenMPTargetVariant(VariantID vid) MUL_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n MUL : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp index bfb1154ce..837d26147 100644 --- a/src/stream/MUL-Seq.cpp +++ b/src/stream/MUL-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { -void MUL::runSeqVariant(VariantID vid) +void MUL::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -79,7 +79,7 @@ void MUL::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n MUL : Unknown variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 6b167de04..55eced2b0 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,7 @@ MUL::MUL(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * + setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(1 * getActualProblemSize()); @@ -58,19 +58,19 @@ MUL::~MUL() { } -void MUL::setUp(VariantID vid) +void MUL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_b, getActualProblemSize(), 0.0, vid); allocAndInitData(m_c, getActualProblemSize(), vid); initData(m_alpha, vid); } -void MUL::updateChecksum(VariantID vid) +void MUL::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_b, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_b, getActualProblemSize()); } -void MUL::tearDown(VariantID vid) +void MUL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_b); diff --git a/src/stream/MUL.hpp 
b/src/stream/MUL.hpp index f8fcefbcb..1e79e17f9 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -43,17 +43,27 @@ class MUL : public KernelBase ~MUL(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + Real_ptr m_b; Real_ptr m_c; Real_type m_alpha; diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index 19175d80d..234683493 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 
2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define TRIAD_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); \ @@ -38,17 +32,20 @@ namespace stream deallocCudaDeviceData(b); \ deallocCudaDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void triad(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { TRIAD_BODY; } } -void TRIAD::runCudaVariant(VariantID vid) +template < size_t block_size > +void TRIAD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -64,7 +61,7 @@ void TRIAD::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - triad<<>>( a, b, c, alpha, + triad<<>>( a, b, c, alpha, iend ); cudaErrchk( cudaGetLastError() ); @@ -81,7 +78,7 @@ void TRIAD::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { TRIAD_BODY; }); @@ -110,11 +107,12 @@ void TRIAD::runCudaVariant(VariantID vid) TRIAD_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n TRIAD : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown Cuda variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRIAD, Cuda) + } // end namespace stream } // end namespace rajaperf diff --git 
a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index deaf20d0f..740727530 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define TRIAD_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(b, m_b, iend); \ @@ -38,17 +32,20 @@ namespace stream deallocHipDeviceData(b); \ deallocHipDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void triad(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { TRIAD_BODY; } } -void TRIAD::runHipVariant(VariantID vid) +template < size_t block_size > +void TRIAD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -64,7 +61,7 @@ void TRIAD::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((triad), dim3(grid_size), dim3(block_size), 0, 0, a, b, c, alpha, + hipLaunchKernelGGL((triad), dim3(grid_size), dim3(block_size), 0, 0, a, b, c, alpha, iend ); hipErrchk( hipGetLastError() ); @@ -85,7 +82,7 @@ void TRIAD::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, 
triad_lambda); hipErrchk( hipGetLastError() ); @@ -112,11 +109,12 @@ void TRIAD::runHipVariant(VariantID vid) TRIAD_DATA_TEARDOWN_HIP; } else { - std::cout << "\n TRIAD : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown Hip variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRIAD, Hip) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/TRIAD-OMP.cpp b/src/stream/TRIAD-OMP.cpp index 1d2060ccf..f1c5c435d 100644 --- a/src/stream/TRIAD-OMP.cpp +++ b/src/stream/TRIAD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { -void TRIAD::runOpenMPVariant(VariantID vid) +void TRIAD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -81,12 +81,12 @@ void TRIAD::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n TRIAD : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp index 9d0d67145..c69e6cdbb 100644 --- a/src/stream/TRIAD-OMPTarget.cpp +++ b/src/stream/TRIAD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -40,7 +40,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void TRIAD::runOpenMPTargetVariant(VariantID vid) +void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -84,7 +84,7 @@ void TRIAD::runOpenMPTargetVariant(VariantID vid) TRIAD_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n TRIAD : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp index dfbac0188..0477202c0 100644 --- a/src/stream/TRIAD-Seq.cpp +++ b/src/stream/TRIAD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -12,13 +12,13 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { -void TRIAD::runSeqVariant(VariantID vid) +void TRIAD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -79,7 +79,7 @@ void TRIAD::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n TRIAD : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index dfa04eda0..543b19642 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -28,7 +28,7 @@ TRIAD::TRIAD(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(2 * getActualProblemSize()); @@ -62,7 +62,7 @@ TRIAD::~TRIAD() { } -void TRIAD::setUp(VariantID vid) +void TRIAD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -70,12 +70,12 @@ void TRIAD::setUp(VariantID vid) initData(m_alpha, vid); } -void TRIAD::updateChecksum(VariantID vid) +void TRIAD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor ); } -void TRIAD::tearDown(VariantID vid) +void TRIAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index 8d2f01236..80685ce3c 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -44,17 +44,27 @@ class TRIAD : public KernelBase ~TRIAD(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + Real_ptr m_a; Real_ptr m_b; Real_ptr m_c; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 000000000..fe0b732f5 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,25 @@ +############################################################################### +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS + common + apps + basic + lcals + polybench + stream + algorithm) +list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) + +raja_add_test( + NAME test-raja-perf-suite + SOURCES test-raja-perf-suite.cpp + DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} + ) + +target_include_directories(test-raja-perf-suite.exe PRIVATE ${PROJECT_SOURCE_DIR}/src) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp new file mode 100644 index 000000000..60dbd7a29 --- /dev/null +++ b/test/test-raja-perf-suite.cpp @@ -0,0 +1,136 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "gtest/gtest.h" + +#include "common/Executor.hpp" +#include "common/KernelBase.hpp" + +#include +#include +#include +#include + +TEST(ShortSuiteTest, Basic) +{ + +// Assemble command line args for basic test + int argc = 4; + +#if defined(RAJA_ENABLE_HIP) && \ + (HIP_VERSION_MAJOR < 5 || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) + argc = 6; +#endif + +#if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) + argc = 6; +#endif + + std::vector< std::string > sargv(argc); + sargv[0] = std::string("dummy "); // for executable name + sargv[1] = std::string("--checkrun"); + sargv[2] = std::string("5"); + sargv[3] = std::string("--show-progress"); + +#if defined(RAJA_ENABLE_HIP) && \ + (HIP_VERSION_MAJOR < 5 || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) + sargv[4] = std::string("--exclude-kernels"); + sargv[5] = 
std::string("HALOEXCHANGE_FUSED"); +#endif + +#if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) + sargv[4] = std::string("--exclude-kernels"); + sargv[5] = std::string("FIRST_MIN"); +#endif + + char** argv = new char* [argc]; + for (int is = 0; is < argc; ++is) { + argv[is] = const_cast<char*>(sargv[is].c_str()); + } + + // STEP 1: Create suite executor object with input args defined above + rajaperf::Executor executor(argc, argv); + + // STEP 2: Assemble kernels and variants to run + executor.setupSuite(); + + // STEP 3: Report suite run summary + executor.reportRunSummary(std::cout); + + // STEP 4: Execute suite + executor.runSuite(); + + // STEP 5: Access suite run data and run through checks + std::vector<rajaperf::KernelBase*> kernels = executor.getKernels(); + std::vector<rajaperf::VariantID> variant_ids = executor.getVariantIDs(); + + + for (size_t ik = 0; ik < kernels.size(); ++ik) { + + rajaperf::KernelBase* kernel = kernels[ik]; + + // + // Get reference checksum (first kernel variant run) + // + rajaperf::Checksum_type cksum_ref = 0.0; + size_t ivck = 0; + bool found_ref = false; + while ( ivck < variant_ids.size() && !found_ref ) { + + rajaperf::VariantID vid = variant_ids[ivck]; + size_t num_tunings = kernel->getNumVariantTunings(vid); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + if ( kernel->wasVariantTuningRun(vid, tune_idx) ) { + cksum_ref = kernel->getChecksum(vid, tune_idx); + found_ref = true; + break; + } + } + ++ivck; + + } // while loop over variants until reference checksum found + + + // + // Check execution time is greater than zero and checksum diff is + // within tolerance for each variant run. 
+ // + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + + rajaperf::VariantID vid = variant_ids[iv]; + + size_t num_tunings = kernel->getNumVariantTunings(variant_ids[iv]); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + if ( kernel->wasVariantTuningRun(vid, tune_idx) ) { + + double rtime = kernel->getTotTime(vid, tune_idx); + + rajaperf::Checksum_type cksum = kernel->getChecksum(vid, tune_idx); + rajaperf::Checksum_type cksum_diff = std::abs(cksum_ref - cksum); + + // Print kernel information when running test manually + std::cout << "Check kernel, variant, tuning : " + << kernel->getName() << " , " + << rajaperf::getVariantName(vid) << " , " + << kernel->getVariantTuningName(vid, tune_idx) + << std::endl; + EXPECT_GT(rtime, 0.0); + EXPECT_LT(cksum_diff, 1e-7); + + } + } + + } // loop over variants + + } // loop over kernels + + // clean up + delete [] argv; +} diff --git a/tpl/RAJA b/tpl/RAJA index 357933a42..87a5cac67 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 357933a42842dd91de5c1034204d937fce0a2a44 +Subproject commit 87a5cac67214e5e96c941bd652b1c0981e9f2123