Add a Dense batched class and kernels #1413

Merged
merged 28 commits into from
Oct 12, 2023
Changes from all commits
28 commits
5f2379f
Add batch dense base class, core and kernels
pratikvn Oct 1, 2023
e9f2aa1
add reference kernels WIP
pratikvn Oct 2, 2023
2f6bd77
Generalize batch utilities
pratikvn Oct 3, 2023
c9d5b44
MultiVector to BatchDense conversion
pratikvn Oct 3, 2023
4053a95
Add tests for BatchDense core
pratikvn Oct 3, 2023
e4928b2
Add reference kernel tests
pratikvn Oct 3, 2023
ecc7e51
Add OMP tests and fix kernel
pratikvn Oct 3, 2023
ccbbb40
Format files
ginkgo-bot Oct 4, 2023
691201a
circ dep and typo fixes
pratikvn Oct 4, 2023
a4b82ea
Add CUDA, HIP kernels and tests
pratikvn Oct 4, 2023
b23dbfa
Add SYCL kernels and tests WIP
pratikvn Oct 4, 2023
07578dd
HIP and CUDA thrust fixes
pratikvn Oct 5, 2023
81bcf74
SYCL kernel fixes
pratikvn Oct 5, 2023
2483667
BatchDense -> batch::Dense
pratikvn Oct 5, 2023
b402b94
Doc updates and multivector view
pratikvn Oct 5, 2023
3ca9fb4
Format files
ginkgo-bot Oct 6, 2023
e5b8813
Use CommonTestFixture value_type
pratikvn Oct 6, 2023
c00b6d9
Review updates
pratikvn Oct 9, 2023
fe21d65
Review updates
pratikvn Oct 9, 2023
f210ea9
dpcpp Jacobi needs ranlux
pratikvn Oct 9, 2023
fb74b71
Remove create_multivector_view
pratikvn Oct 9, 2023
660ec7c
Format files
ginkgo-bot Oct 9, 2023
76726e9
const_array_view needs to be in gko::
pratikvn Oct 9, 2023
09b7574
Review updates
pratikvn Oct 10, 2023
94452e9
Move apply validation to BatchLinOp
pratikvn Oct 10, 2023
4a18c40
Add to test_install
pratikvn Oct 10, 2023
927e8c8
Format files
ginkgo-bot Oct 10, 2023
190a010
Review updates
pratikvn Oct 10, 2023
78 changes: 78 additions & 0 deletions common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc
@@ -0,0 +1,78 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2023, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/


template <typename ValueType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const batch::matrix::Dense<ValueType>* mat,
const batch::MultiVector<ValueType>* b,
batch::MultiVector<ValueType>* x)
{
const auto num_blocks = mat->get_num_batch_items();
const auto b_ub = get_batch_struct(b);
const auto x_ub = get_batch_struct(x);
const auto mat_ub = get_batch_struct(mat);
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
simple_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(mat_ub, b_ub, x_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);


template <typename ValueType>
void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
const batch::MultiVector<ValueType>* alpha,
const batch::matrix::Dense<ValueType>* mat,
const batch::MultiVector<ValueType>* b,
const batch::MultiVector<ValueType>* beta,
batch::MultiVector<ValueType>* x)
{
const auto num_blocks = mat->get_num_batch_items();
const auto b_ub = get_batch_struct(b);
const auto x_ub = get_batch_struct(x);
const auto mat_ub = get_batch_struct(mat);
const auto alpha_ub = get_batch_struct(alpha);
const auto beta_ub = get_batch_struct(beta);
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
advanced_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
beta_ub, x_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
164 changes: 164 additions & 0 deletions common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
@@ -0,0 +1,164 @@
/* Ginkgo BSD 3-Clause license header (2017-2023), identical to the block above. */


template <typename ValueType>
__device__ __forceinline__ void simple_apply(
const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
const ValueType* const __restrict__ b, ValueType* const __restrict__ x)
{
constexpr auto tile_size = config::warp_size;

auto thread_block = group::this_thread_block();
auto subgroup = group::tiled_partition<tile_size>(thread_block);
const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);

for (int row = subgroup_id; row < mat.num_rows;
row += num_subgroups_per_block) {
ValueType temp = zero<ValueType>();
for (int j = subgroup.thread_rank(); j < mat.num_cols;
j += subgroup.size()) {
const ValueType val = mat.values[row * mat.stride + j];
temp += val * b[j];
}

// subgroup level reduction
temp = reduce(subgroup, temp, thrust::plus<ValueType>{});

if (subgroup.thread_rank() == 0) {
x[row] = temp;
}
}
}

template <typename ValueType>
__global__ __launch_bounds__(default_block_size, sm_oversubscription) void
simple_apply_kernel(
    const gko::batch::matrix::dense::uniform_batch<const ValueType> mat,
    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
    const gko::batch::multi_vector::uniform_batch<ValueType> x)
{
for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
batch_id += gridDim.x) {
const auto mat_b =
gko::batch::matrix::extract_batch_item(mat, batch_id);
const auto b_b = gko::batch::extract_batch_item(b, batch_id);
const auto x_b = gko::batch::extract_batch_item(x, batch_id);
simple_apply(mat_b, b_b.values, x_b.values);
}
}


template <typename ValueType>
__device__ __forceinline__ void advanced_apply(
const ValueType alpha,
const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
const ValueType* const __restrict__ b, const ValueType beta,
ValueType* const __restrict__ x)
{
constexpr auto tile_size = config::warp_size;

auto thread_block = group::this_thread_block();
auto subgroup = group::tiled_partition<tile_size>(thread_block);
const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);

for (int row = subgroup_id; row < mat.num_rows;
row += num_subgroups_per_block) {
ValueType temp = zero<ValueType>();
for (int j = subgroup.thread_rank(); j < mat.num_cols;
j += subgroup.size()) {
const ValueType val = mat.values[row * mat.stride + j];
temp += alpha * val * b[j];
}

// subgroup level reduction
temp = reduce(subgroup, temp, thrust::plus<ValueType>{});

if (subgroup.thread_rank() == 0) {
x[row] = temp + beta * x[row];
}
}
}

template <typename ValueType>
__global__ __launch_bounds__(default_block_size, sm_oversubscription) void
advanced_apply_kernel(
    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
    const gko::batch::matrix::dense::uniform_batch<const ValueType> mat,
    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
    const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
    const gko::batch::multi_vector::uniform_batch<ValueType> x)
{
for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
batch_id += gridDim.x) {
const auto mat_b =
gko::batch::matrix::extract_batch_item(mat, batch_id);
const auto b_b = gko::batch::extract_batch_item(b, batch_id);
const auto x_b = gko::batch::extract_batch_item(x, batch_id);
const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
const auto beta_b = gko::batch::extract_batch_item(beta, batch_id);
advanced_apply(alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
x_b.values);
}
}
1 change: 1 addition & 0 deletions core/CMakeLists.txt
@@ -39,6 +39,7 @@ target_sources(ginkgo
log/vtune.cpp
log/record.cpp
log/stream.cpp
matrix/batch_dense.cpp
matrix/coo.cpp
matrix/csr.cpp
matrix/dense.cpp
28 changes: 25 additions & 3 deletions core/base/batch_multi_vector.cpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/matrix_data.hpp>
#include <ginkgo/core/base/utils.hpp>
#include <ginkgo/core/matrix/batch_dense.hpp>


#include "core/base/batch_multi_vector_kernels.hpp"
@@ -72,7 +73,7 @@ namespace detail {

template <typename ValueType>
batch_dim<2> compute_batch_size(
const std::vector<matrix::Dense<ValueType>*>& matrices)
const std::vector<gko::matrix::Dense<ValueType>*>& matrices)
{
auto common_size = matrices[0]->get_size();
for (size_type i = 1; i < matrices.size(); ++i) {
@@ -86,7 +87,7 @@


template <typename ValueType>
std::unique_ptr<matrix::Dense<ValueType>>
std::unique_ptr<gko::matrix::Dense<ValueType>>
MultiVector<ValueType>::create_view_for_item(size_type item_id)
{
auto exec = this->get_executor();
@@ -102,7 +103,7 @@


template <typename ValueType>
std::unique_ptr<const matrix::Dense<ValueType>>
std::unique_ptr<const gko::matrix::Dense<ValueType>>
MultiVector<ValueType>::create_const_view_for_item(size_type item_id) const
{
auto exec = this->get_executor();
@@ -290,6 +291,27 @@ void MultiVector<ValueType>::move_to(
}


template <typename ValueType>
void MultiVector<ValueType>::convert_to(matrix::Dense<ValueType>* result) const
{
auto exec = result->get_executor() == nullptr ? this->get_executor()
: result->get_executor();
auto tmp = gko::batch::matrix::Dense<ValueType>::create_const(
exec, this->get_size(),
make_const_array_view(this->get_executor(),
this->get_num_stored_elements(),
this->get_const_values()));
result->copy_from(tmp);
}


template <typename ValueType>
void MultiVector<ValueType>::move_to(matrix::Dense<ValueType>* result)
{
this->convert_to(result);
Review comment (Member): we should do an actual move here. It would be the same as above, except that the value array of this is moved.
}


#define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type>
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR);

1 change: 0 additions & 1 deletion core/base/batch_multi_vector_kernels.hpp
@@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/matrix/diagonal.hpp>


#include "core/base/kernel_declaration.hpp"
16 changes: 8 additions & 8 deletions core/base/batch_struct.hpp
@@ -51,9 +51,9 @@ template <typename ValueType>
struct batch_item {
using value_type = ValueType;
ValueType* values;
int stride;
int num_rows;
int num_rhs;
int32 stride;
int32 num_rows;
int32 num_rhs;
};


@@ -67,9 +67,9 @@ struct uniform_batch {

ValueType* values;
size_type num_batch_items;
int stride;
int num_rows;
int num_rhs;
int32 stride;
int32 num_rows;
int32 num_rhs;
Review comment on lines +70 to +72 (Member): I think int -> int32 is not necessary.

size_type get_entry_storage() const
{
@@ -117,8 +118,8 @@ extract_batch_item(const multi_vector::uniform_batch<ValueType>& batch,

template <typename ValueType>
GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item<ValueType>
extract_batch_item(ValueType* const batch_values, const int stride,
const int num_rows, const int num_rhs,
extract_batch_item(ValueType* const batch_values, const int32 stride,
const int32 num_rows, const int32 num_rhs,
const size_type batch_idx)
{
return {batch_values + batch_idx * stride * num_rows, stride, num_rows,