-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpu_math.cuh
102 lines (87 loc) · 3.1 KB
/
gpu_math.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#ifndef GPU_MATH_H
#define GPU_MATH_H
#include <iostream>
namespace DIM_CUDA_HIDDEN {
/** Compute the matrix-vector product y = A * x for two matrix rows per block.
 * For operator-overloading purposes the element operation is V += T * U.
 * Expected launch configuration: <<<height/2, 32>>> — one block per pair of
 * output rows, one warp (32 threads) per block.
 * Preconditions: size (matrix width == length of x) is a multiple of 32;
 * the matrix height (== length of y) is a multiple of 2. A is row-major.
 * \param[in]  A    device pointer to the input matrix (height x size)
 * \param[in]  x    device pointer to the input vector (length size)
 * \param[out] y    device pointer to the output vector (length height)
 * \param[in]  size width of the matrix / length of the input vector
 * \param[in]  zero additive identity used to initialise the accumulators
 */
template <class T, class U, class V>
__global__ void gpu_matMulti(T* A, U* x, V* y, int size, V zero) {
    // Per-lane partial sums for the two rows handled by this block.
    __shared__ V s[32];
    __shared__ V u[32];
    // i = lane index within the warp (column offset inside each 32-wide strip)
    int i = threadIdx.x;
    // j = first of the two matrix rows this block produces
    int j = blockIdx.x * 2;
    // Each lane accumulates a strided partial dot product over elements
    // i, i+32, i+64, ... of rows j and j+1.
    V first_sum = zero;
    V second_sum = zero;
    for (int n = 0; n < size/32; n++) {
        // BUG FIX: the strip offset must advance by 32 per iteration.
        // The original used "+ n", which only touched the first
        // size/32 + 31 entries of the row (with heavy overlap between
        // lanes) and so did not compute the matrix-vector product.
        int ind = (j*size) + i + n*32;
        // BUG FIX: keep the loaded value in the template type U instead
        // of forcing a float conversion, which silently truncated
        // double/integer instantiations.
        U x_val = x[i + n*32];
        first_sum += A[ind] * x_val;
        second_sum += A[ind+size] * x_val;
    }
    // Publish the per-lane partials for the tree reduction.
    s[i] = first_sum;
    u[i] = second_sum;
    __syncthreads();
    // Dual tree reduction: lanes 0..w-1 fold s from the bottom while
    // lanes 31..32-w fold u from the top, halving w each pass. The two
    // halves never touch the same shared slots, so there is no race.
    #pragma unroll
    for (int w = 16; w > 1; w/=2) {
        if (i < w) {
            V my_sum = s[i];          // BUG FIX: was float, now V
            my_sum += s[i+w];
            s[i] = my_sum;
        }
        else if (31-i < w) {
            int my_ind = 31-i;
            V my_sum = u[my_ind];     // BUG FIX: was float, now V
            my_sum += u[my_ind+w];
            u[my_ind] = my_sum;
        }
        // Barrier is outside the divergent branch: every lane reaches it.
        __syncthreads();
    }
    // Final pair-combine: lane 0 writes row j, lane 1 writes row j+1.
    if (i == 0) {
        y[j] = s[0] + s[1];
    }
    else if (i == 1) {
        y[j+1] = u[0] + u[1];
    }
};
}
namespace gpu {
/** Take matrix A and column vector x and store the product Ax into column vector y.
 * NOTE: for overloading purposes, multiplication operations use [V = T * U]
 * Preconditions: A.width() == x.size() and is a multiple of 32;
 * A.height() == y.size() and is a multiple of 2.
 * \param[in]  A     input matrix of type T
 * \param[in]  x     input vector of type U
 * \param[out] y     output vector of type V
 * \param[in]  zero  value to start summing y values from
 * \param[in]  check whether to validate that sizes match up (default: true)
 */
template <class T, class U, class V>
void matMulti(Matrix2D<T> &A, Vector1D<U> &x, Vector1D<V> &y, V zero, bool check=true) {
    // Validate the kernel's size preconditions; diagnostics go to stderr.
    if (check) {
        if (A.width() != x.size()) std::cerr << "ERROR: Matrix Multiplication input size does not match matrix!" << std::endl;
        if (A.width() % 32 != 0) std::cerr << "ERROR: Matrix Multiplication input size is not multiple of 32!" << std::endl;
        // BUG FIX: this message previously duplicated the width-mismatch
        // text; it reports the OUTPUT (height) mismatch.
        if (A.height() != y.size()) std::cerr << "ERROR: Matrix Multiplication output size does not match matrix!" << std::endl;
        if (A.height() % 2 != 0) std::cerr << "ERROR: Matrix Multiplication output size is not multiple of 2!" << std::endl;
    }
    // One block per pair of output rows, one warp per block (the kernel's
    // expected launch configuration).
    DIM_CUDA_HIDDEN::gpu_matMulti<T, U, V><<<A.height()/2, 32>>>(
        A.get_data(), x.get_data(), y.get_data(), A.width(), zero
    );
    // Kernel launches do not return errors directly; surface bad launch
    // configurations (e.g. height 0 -> zero blocks) immediately.
    cudaError_t launch_err = cudaGetLastError();
    if (launch_err != cudaSuccess) {
        std::cerr << "ERROR: matMulti kernel launch failed: "
                  << cudaGetErrorString(launch_err) << std::endl;
    }
};
}
#endif