英文:
Determinant of a small complex matrix within a GPU thread
问题
问题是计算复杂矩阵(5x5、4x4和3x3)的行列式,要求在特定线程内的设备函数中完成。效率至关重要,因为原问题涉及在每个线程的循环内计算数千个这样的矩阵。
这里是使用直接展开
(参数为0运行)和余子式展开
(参数为1运行)方法的工作示例:
#include <stdio.h>
#include <complex.h>
#include <cstring>
#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N 5
// ...(略过一部分代码)
__host__ __device__ static __inline__ cuDoubleComplex cuCfms(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex d){
// ...(略过一部分代码)
}
__host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v) {
// ...(略过一部分代码)
}
// ...(略过一部分代码)
__device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
// ...(略过一部分代码)
}
__device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// ...(略过一部分代码)
}
__device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// ...(略过一部分代码)
}
// ...(略过一部分代码)
__device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
// ...(略过一部分代码)
}
// ...(略过一部分代码)
__device__ cuDoubleComplex gpu_device_kernel(int i, int method) {
// ...(略过一部分代码)
}
// ...(略过一部分代码)
__global__ void gpu_global_kernel(cuDoubleComplex *d_res, int method) {
// ...(略过一部分代码)
}
int main(int argc, char *argv[]) {
// ...(略过一部分代码)
}
然而,我在考虑这些实现是否真的高效。 我正在寻找一种替代方法,可以在设备函数中计算一般矩阵的行列式。 我提供的示例是最简单直接的方法之一,但我正在寻找另一种方法,比如LU/Cholesky分解,可以从设备函数中调用它。 不幸的是,诸如LAPACK之类的包对于CUDA不可用,但也许来自其他一些软件包的类似兼容函数可以在CUDA设备环境中使用。 对于可以在CUDA设备上使用的类似函数的任何建议或示例将不胜感激!
英文:
The problem is to compute the determinant of a complex matrix (5x5, 4x4, and 3x3) from a device function within a particular thread. Efficiency is crucial, as the original problem involves computing thousands of such matrices in a for loop within each thread.
Here is a working example using direct expansion
(run with arg: 0) and cofactor expansion
(run with arg: 1) methods:
#include <stdio.h>
#include <complex.h>
#include <cstring>
#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N 5
__host__ __device__ static __inline__ cuDoubleComplex cuCfms( cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex d){
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) - cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) - cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v) {
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y) + cuCreal(u) + cuCreal(v),
cuCimag(x) + cuCimag(y) + cuCimag(u) + cuCimag(v));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCasas(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v) {
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v),
cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCasasa(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v,
cuDoubleComplex w) {
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v) + cuCreal(w),
cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v) + cuCimag(w) );
}
__device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// B[12]*(B[18]*B[24] - B[19]*B[23])
//a*b*c -d*e
return cuCmul(a, cuCfms(b,c, cuCmul(d,e)));
}
__device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// B[12]*(B[18]*B[24] - B[19]*B[23])
//a*b*c -d*e
return cuCmul(cuCmul(make_cuDoubleComplex(-1.0,0.0),a), cuCfms(b,c, cuCmul(d,e)));
}
__device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
cuDoubleComplex det;
cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det5 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det11 = cuCmul(B[0],cuCmul(B[6],cuCadd( cuC_5pos(B[12],B[18],B[24],B[19],B[23]),
cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]),
cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det12 = cuCmul(B[0],cuCmul(B[11],cuCadd(cuC_5pos(B[7],B[18],B[24],B[19],B[23]),
cuCadd( cuC_5neg(B[8],B[17],B[24],B[22],B[19]),
cuC_5pos(B[9],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det13 = cuCmul(B[0],cuCmul(B[16],cuCadd(cuC_5pos(B[7],B[13],B[24],B[23],B[14]),
cuCadd( cuC_5neg(B[8],B[12],B[24],B[22],B[14]),
cuC_5pos(B[9],B[12],B[23],B[22],B[13]) ) ) ) );
cuDoubleComplex det14 = cuCmul(B[0],cuCmul(B[21],cuCadd(cuC_5pos(B[7],B[13],B[19],B[18],B[14]),
cuCadd( cuC_5neg(B[8],B[12],B[19],B[17],B[14]),
cuC_5pos(B[9],B[12],B[18],B[13],B[17]) ) ) ) );
det1 = cuCasas(det11, det12, det13, det14);
cuDoubleComplex det21 = cuCmul(B[1*5+0],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[12],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]), cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det22 = cuCmul(B[1*5+0],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[2],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[3],B[17],B[24],B[22],B[19]), cuC_5pos(B[4],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det23 = cuCmul(B[1*5+0],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[2],B[13],B[24],B[23],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[24],B[22],B[14]), cuC_5pos(B[4],B[12],B[23],B[22],B[13]) ) ) ) );
cuDoubleComplex det24 = cuCmul(B[1*5+0],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[2],B[13],B[19],B[18],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[19],B[17],B[14]), cuC_5pos(B[4],B[12],B[18],B[13],B[17]) ) ) ) );
det2 = cuCasas(det21, det22, det23, det24);
cuDoubleComplex det31 = cuCmul(B[10],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[1*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
cuC_5pos(B[1*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
cuDoubleComplex det32 = cuCmul(B[10],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[0*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
cuC_5pos(B[0*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
cuDoubleComplex det33 = cuCmul(B[10],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det34 = cuCmul(B[10],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[1*5+3],B[3*5+2]) ) ) ) );
det3 = cuCasas(det31, det32, det33, det34);
cuDoubleComplex det41 = cuCmul(B[15],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
cuC_5pos(B[1*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det42 = cuCmul(B[15],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
cuC_5pos(B[0*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det43 = cuCmul(B[15],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det44 = cuCmul(B[15],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
det4 = cuCasas(det41, det42, det43, det44);
cuDoubleComplex det51 = cuCmul(B[20],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
cuC_5pos(B[1*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det52 = cuCmul(B[20],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
cuC_5pos(B[0*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det53 = cuCmul(B[20],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[3*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det54 = cuCmul(B[20],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
det5 = cuCasas(det51, det52, det53, det54);
det = cuCasasa(det1,det2,det3,det4,det5);
return det;
}
#define N 5
__device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
// Calculate the determinant using cofactor expansions
det1 = cuCmul(matrix[0],
cuCadd( cuCsub( cuCmul( matrix[5],
cuCsub( cuCmul(matrix[10],matrix[15]),
cuCmul(matrix[11],matrix[14]) ))
,cuCmul(matrix[6],
cuCsub( cuCmul(matrix[9],matrix[15]),
cuCmul(matrix[11],matrix[13]) ) ) )
,cuCmul(matrix[7],
cuCsub( cuCmul(matrix[9],matrix[14]),
cuCmul(matrix[10],matrix[13]) ) ) ) );
det2 = cuCmul(matrix[1],
cuCadd( cuCsub( cuCmul( matrix[4],
cuCsub( cuCmul(matrix[10],matrix[15]),
cuCmul(matrix[11],matrix[14]) ))
,cuCmul(matrix[6],
cuCsub( cuCmul(matrix[8],matrix[15]),
cuCmul(matrix[11],matrix[12]) ) ) )
,cuCmul(matrix[7],
cuCsub( cuCmul(matrix[8],matrix[14]),
cuCmul(matrix[10],matrix[12]) ) ) ) );
det3 = cuCmul(matrix[2],
cuCadd( cuCsub( cuCmul( matrix[4],
cuCsub( cuCmul(matrix[9],matrix[15]),
cuCmul(matrix[11],matrix[13]) ))
,cuCmul(matrix[5],
cuCsub( cuCmul(matrix[8],matrix[15]),
cuCmul(matrix[11],matrix[12]) ) ) )
,cuCmul(matrix[7],
cuCsub( cuCmul(matrix[8],matrix[13]),
cuCmul(matrix[9],matrix[12]) ) ) ) );
det4 = cuCmul(matrix[3],
cuCadd( cuCsub( cuCmul( matrix[4],
cuCsub( cuCmul(matrix[9],matrix[14]),
cuCmul(matrix[10],matrix[13]) ))
,cuCmul(matrix[5],
cuCsub( cuCmul(matrix[8],matrix[14]),
cuCmul(matrix[10],matrix[12]) ) ) )
,cuCmul(matrix[6],
cuCsub( cuCmul(matrix[8],matrix[13]),
cuCmul(matrix[9],matrix[12]) ) ) ) );
det = cuCsub( cuCadd(det1,det3), cuCadd(det2,det4) );
return det;
}
__device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
// Calculate the determinant using cofactor expansions
for (int col = 0; col < N; col++) {
cuDoubleComplex submatrix[N-1][N-1];
int subrow = 0, subcol = 0;
// Create the submatrix by excluding the current row and column
for (int row = 1; row < N; row++) {
for (int c = 0; c < N; c++) {
if (c == col) {
continue;
}
submatrix[subrow][subcol] = matrix[row * N + c];
subcol++;
}
subrow++;
subcol = 0;
}
// Calculate the cofactor
cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1),0.0);
cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign,matrix[col]), det_4x4(&submatrix[0][0]));
// Accumulate the cofactors to calculate the determinant
det = cuCadd(det, cofactor);
}
return det;
}
__device__ cuDoubleComplex gpu_device_kernel(int i, int method) {
// some random data, depending on `i`, but for simplification say constant data
cuDoubleComplex matrix[5*5] = {
make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
};
cuDoubleComplex cf_sign[5]={make_cuDoubleComplex(1.0,0.0),
make_cuDoubleComplex(-1.0,0.0),
make_cuDoubleComplex(1.0,0.0),
make_cuDoubleComplex(-1.0,0.0),
make_cuDoubleComplex(1.0,0.0),
};
cuDoubleComplex det;
if (method == 0) {
det = direct_det5(matrix);
} else {
det = det_5x5(matrix);
}
return det;
}
__global__ void gpu_global_kernel(cuDoubleComplex *d_res, int method) {
// some other computations, irrelevant to current problem
uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands
cuDoubleComplex det_sum, det;
det_sum = make_cuDoubleComplex(0.0,0.0);
for (int i = 0; i < total; i++) {
det = gpu_device_kernel(i, method);
det_sum = cuCadd(det_sum, det);
}
det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
d_res[0] = det_sum;
return;
}
int main(int argc, char *argv[]) {
if (argc != 2) {
printf("Usage: %s <num1> <num2>\n", argv[0]);
return 1;
}
int method;
method = atoi(argv[1]);
double complex *res;
res = (double complex*)malloc(sizeof(double complex));
memset(res, 0.0,sizeof(double complex));
cuDoubleComplex *d_res;
cudaMalloc(&d_res, sizeof(cuDoubleComplex) );
int *d_method;
cudaMalloc(&d_method, sizeof(int) );
cudaMemcpy(d_method, &method, sizeof(int), cudaMemcpyHostToDevice);
clock_t direct_start, direct_end;
direct_start = clock();
gpu_global_kernel <<<1,1>>>(d_res, *d_method);
cudaDeviceSynchronize();
direct_end = clock();
cudaMemcpy(res, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );
printf("RESULT: %.16E + %.16E i\n", creal(res[0]), cimag(res[0]));
double direct_time = (double)((double)(direct_end - direct_start) / CLOCKS_PER_SEC);
if (method == 0) {
printf("DIRECT EXEC: %.5f s \n", direct_time);
} else {
printf("COFACTOR EXEC: %.5f s \n", direct_time);
}
cudaDeviceReset();
return 0;
}
However, I am wondering if these implementation is even efficient.
I am looking for an alternative method to compute the determinant of a general matrix inside a device function. The example I have provided is one of the most straightforward methods, but an alternative approach, such as LU/Cholesky decomposition, that can be called from the device function is what I am seeking. Unfortunately, packages like LAPACK are not available for CUDA, but maybe a similar compatible function from some other packages could work within the CUDA device environment.
Any suggestions or function examples would be greatly appreciated!
答案1
得分: 1
由于您包含了一个C++头文件(<cstring>)
,我假设这是一段C++代码,尽管它看起来非常像C语言。
我直接采用您提供的用于扩展的生成代码,通过简单地定义cuDoubleComplex
的*
、+
和-
运算符来实现它。我在下面的代码中将此方法称为“generated”。此外,我对您的代码进行了一些轻微的修改。我添加了一个程序参数来指定GPU线程的数量,以确保GPU的充分利用。我还为每种方法使用了单独的内核。方法0是原始方法,方法1是您的直接方法,方法2是我的“generated”方法。
我使用以下命令编译了该代码:nvcc -std=c++14 -O3 -arch=sm_70 main.cu -o main
,并在V100 Pcie上针对32000个GPU线程看到以下时间。
配置 | 时间 |
---|---|
原始 | 145.01487秒 |
直接 | 67.07063秒 |
生成 | 24.02771秒 |
以下是代码:
#include <stdio.h>
#include <complex.h>
#include <cstring>
#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N 5
// ... 其余代码 ...
请注意,我仅返回代码的部分翻译。如果您需要更多信息或有其他问题,请随时提出。
英文:
Since you are including a C++ header (<cstring>)
I am assuming its a C++ code, even though it looks very C-like.
I took the general generated code that you provided for expansion and implemented it directly as is by simply defining the operators *,+,-
for the cuDoubleComplex
.
I call this method "generated" in the code below. Additionally, I made some slight modifications to your code. I added a program parameter to specify the number of GPU threads to have a proper GPU utilization. I also use separate kernels for each method. Method 0 is the original method, method 1 is your direct method, method 2 is my "generated" method.
I compiled the code with nvcc -std=c++14 -O3 -arch=sm_70 main.cu -o main
and see the following timings on V100 Pcie for 32000 GPU threads.
Configuration | Timing |
---|---|
original | 145.01487 s |
direct | 67.07063 s |
generated | 24.02771 s |
The code:
#include <stdio.h>
#include <complex.h>
#include <cstring>
#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N 5
__host__ __device__
bool operator==(cuDoubleComplex l, cuDoubleComplex r){
return cuCreal(l) == cuCreal(r) && cuCimag(l) == cuCimag(r);
}
__host__ __device__
cuDoubleComplex operator*(cuDoubleComplex l, cuDoubleComplex r){
return cuCmul(l,r);
}
__host__ __device__
cuDoubleComplex operator+(cuDoubleComplex l, cuDoubleComplex r){
return cuCadd(l,r);
}
__host__ __device__
cuDoubleComplex operator-(cuDoubleComplex l, cuDoubleComplex r){
return cuCsub(l,r);
}
__host__ __device__
cuDoubleComplex operator-(cuDoubleComplex l){
return make_cuDoubleComplex(-cuCreal(l), -cuCimag(l));
}
template<class Matrix>
__device__
cuDoubleComplex generated_det_5x5(Matrix B){
return
B[0*5+0]*(B[1*5+1]*(B[2*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[2*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[2*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[2*5+1]*(B[1*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[1*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[1*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[1*5+2]*(B[2*5+3]*B[4*5+4]-B[4*5+3]*B[2*5+4])-B[1*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[4*5+1]*(B[1*5+2]*(B[2*5+3]*B[3*5+4]-B[3*5+3]*B[2*5+4])-B[1*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[3*5+3]-B[2*5+3]*B[3*5+2])))
- B[1*5+0]*(B[0*5+1]*(B[2*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[2*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[2*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[2*5+1]*(B[0*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[0*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[0*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[0*5+2]*(B[2*5+3]*B[4*5+4]-B[4*5+3]*B[2*5+4])-B[0*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[4*5+1]*(B[0*5+2]*(B[2*5+3]*B[3*5+4]-B[3*5+3]*B[2*5+4])-B[0*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[3*5+3]-B[2*5+3]*B[3*5+2])))
+ B[2*5+0]*(B[0*5+1]*(B[1*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[1*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[1*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[1*5+1]*(B[0*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[0*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[0*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[0*5+2]*(B[1*5+3]*B[4*5+4]-B[4*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[4*5+4]-B[4*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[4*5+3]-B[4*5+2]*B[1*5+3]))-B[4*5+1]*(B[0*5+2]*(B[1*5+3]*B[3*5+4]-B[3*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[3*5+4]-B[3*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[3*5+3]-B[1*5+3]*B[3*5+2])))
- B[3*5+0]*(B[0*5+1]*(B[1*5+2]*(B[2*5+3]*B[4*5+4]-B[2*5+4]*B[4*5+3])-B[1*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[1*5+1]*(B[0*5+2]*(B[2*5+3]*B[4*5+4]-B[2*5+4]*B[4*5+3])-B[0*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))+B[2*5+1]*(B[0*5+2]*(B[1*5+3]*B[4*5+4]-B[4*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[4*5+4]-B[4*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[4*5+3]-B[4*5+2]*B[1*5+3]))-B[4*5+1]*(B[0*5+2]*(B[1*5+3]*B[2*5+4]-B[2*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[2*5+4]-B[2*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[2*5+3]-B[1*5+3]*B[2*5+2])))
+ B[4*5+0]*(B[0*5+1]*(B[1*5+2]*(B[2*5+3]*B[3*5+4]-B[2*5+4]*B[3*5+3])-B[1*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[3*5+3]-B[3*5+2]*B[2*5+3]))-B[1*5+1]*(B[0*5+2]*(B[2*5+3]*B[3*5+4]-B[2*5+4]*B[3*5+3])-B[0*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[3*5+3]-B[3*5+2]*B[2*5+3]))+B[2*5+1]*(B[0*5+2]*(B[1*5+3]*B[3*5+4]-B[3*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[3*5+4]-B[3*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[3*5+3]-B[3*5+2]*B[1*5+3]))-B[3*5+1]*(B[0*5+2]*(B[1*5+3]*B[2*5+4]-B[2*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[2*5+4]-B[2*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[2*5+3]-B[1*5+3]*B[2*5+2])));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCfms( cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex d){
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) - cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) - cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v) {
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y) + cuCreal(u) + cuCreal(v),
cuCimag(x) + cuCimag(y) + cuCimag(u) + cuCimag(v));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCasas(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v) {
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v),
cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCasasa(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v,
cuDoubleComplex w) {
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v) + cuCreal(w),
cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v) + cuCimag(w) );
}
__device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// B[12]*(B[18]*B[24] - B[19]*B[23])
//a*b*c -d*e
return cuCmul(a, cuCfms(b,c, cuCmul(d,e)));
}
__device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// B[12]*(B[18]*B[24] - B[19]*B[23])
//a*b*c -d*e
return cuCmul(cuCmul(make_cuDoubleComplex(-1.0,0.0),a), cuCfms(b,c, cuCmul(d,e)));
}
__device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
cuDoubleComplex det;
cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det5 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det11 = cuCmul(B[0],cuCmul(B[6],cuCadd( cuC_5pos(B[12],B[18],B[24],B[19],B[23]),
cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]),
cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det12 = cuCmul(B[0],cuCmul(B[11],cuCadd(cuC_5pos(B[7],B[18],B[24],B[19],B[23]),
cuCadd( cuC_5neg(B[8],B[17],B[24],B[22],B[19]),
cuC_5pos(B[9],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det13 = cuCmul(B[0],cuCmul(B[16],cuCadd(cuC_5pos(B[7],B[13],B[24],B[23],B[14]),
cuCadd( cuC_5neg(B[8],B[12],B[24],B[22],B[14]),
cuC_5pos(B[9],B[12],B[23],B[22],B[13]) ) ) ) );
cuDoubleComplex det14 = cuCmul(B[0],cuCmul(B[21],cuCadd(cuC_5pos(B[7],B[13],B[19],B[18],B[14]),
cuCadd( cuC_5neg(B[8],B[12],B[19],B[17],B[14]),
cuC_5pos(B[9],B[12],B[18],B[13],B[17]) ) ) ) );
det1 = cuCasas(det11, det12, det13, det14);
cuDoubleComplex det21 = cuCmul(B[1*5+0],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[12],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]), cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det22 = cuCmul(B[1*5+0],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[2],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[3],B[17],B[24],B[22],B[19]), cuC_5pos(B[4],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det23 = cuCmul(B[1*5+0],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[2],B[13],B[24],B[23],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[24],B[22],B[14]), cuC_5pos(B[4],B[12],B[23],B[22],B[13]) ) ) ) );
cuDoubleComplex det24 = cuCmul(B[1*5+0],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[2],B[13],B[19],B[18],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[19],B[17],B[14]), cuC_5pos(B[4],B[12],B[18],B[13],B[17]) ) ) ) );
det2 = cuCasas(det21, det22, det23, det24);
cuDoubleComplex det31 = cuCmul(B[10],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[1*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
cuC_5pos(B[1*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
cuDoubleComplex det32 = cuCmul(B[10],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[0*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
cuC_5pos(B[0*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
cuDoubleComplex det33 = cuCmul(B[10],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det34 = cuCmul(B[10],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[1*5+3],B[3*5+2]) ) ) ) );
det3 = cuCasas(det31, det32, det33, det34);
cuDoubleComplex det41 = cuCmul(B[15],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
cuC_5pos(B[1*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det42 = cuCmul(B[15],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
cuC_5pos(B[0*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det43 = cuCmul(B[15],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det44 = cuCmul(B[15],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
det4 = cuCasas(det41, det42, det43, det44);
cuDoubleComplex det51 = cuCmul(B[20],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
cuC_5pos(B[1*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det52 = cuCmul(B[20],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
cuC_5pos(B[0*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det53 = cuCmul(B[20],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[3*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det54 = cuCmul(B[20],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
det5 = cuCasas(det51, det52, det53, det54);
det = cuCasasa(det1,det2,det3,det4,det5);
return det;
}
__device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
// Calculate the determinant using cofactor expansions
det1 = cuCmul(matrix[0],
cuCadd( cuCsub( cuCmul( matrix[5],
cuCsub( cuCmul(matrix[10],matrix[15]),
cuCmul(matrix[11],matrix[14]) ))
,cuCmul(matrix[6],
cuCsub( cuCmul(matrix[9],matrix[15]),
cuCmul(matrix[11],matrix[13]) ) ) )
,cuCmul(matrix[7],
cuCsub( cuCmul(matrix[9],matrix[14]),
cuCmul(matrix[10],matrix[13]) ) ) ) );
det2 = cuCmul(matrix[1],
cuCadd( cuCsub( cuCmul( matrix[4],
cuCsub( cuCmul(matrix[10],matrix[15]),
cuCmul(matrix[11],matrix[14]) ))
,cuCmul(matrix[6],
cuCsub( cuCmul(matrix[8],matrix[15]),
cuCmul(matrix[11],matrix[12]) ) ) )
,cuCmul(matrix[7],
cuCsub( cuCmul(matrix[8],matrix[14]),
cuCmul(matrix[10],matrix[12]) ) ) ) );
det3 = cuCmul(matrix[2],
cuCadd( cuCsub( cuCmul( matrix[4],
cuCsub( cuCmul(matrix[9],matrix[15]),
cuCmul(matrix[11],matrix[13]) ))
,cuCmul(matrix[5],
cuCsub( cuCmul(matrix[8],matrix[15]),
cuCmul(matrix[11],matrix[12]) ) ) )
,cuCmul(matrix[7],
cuCsub( cuCmul(matrix[8],matrix[13]),
cuCmul(matrix[9],matrix[12]) ) ) ) );
det4 = cuCmul(matrix[3],
cuCadd( cuCsub( cuCmul( matrix[4],
cuCsub( cuCmul(matrix[9],matrix[14]),
cuCmul(matrix[10],matrix[13]) ))
,cuCmul(matrix[5],
cuCsub( cuCmul(matrix[8],matrix[14]),
cuCmul(matrix[10],matrix[12]) ) ) )
,cuCmul(matrix[6],
cuCsub( cuCmul(matrix[8],matrix[13]),
cuCmul(matrix[9],matrix[12]) ) ) ) );
det = cuCsub( cuCadd(det1,det3), cuCadd(det2,det4) );
return det;
}
__device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
// Calculate the determinant using cofactor expansions
#pragma unroll
for (int col = 0; col < N; col++) {
cuDoubleComplex submatrix[N-1][N-1];
int subrow = 0, subcol = 0;
// Create the submatrix by excluding the current row and column
#pragma unroll
for (int row = 1; row < N; row++) {
#pragma unroll
for (int c = 0; c < N; c++) {
if (c == col) {
continue;
}
submatrix[subrow][subcol] = matrix[row * N + c];
subcol++;
}
subrow++;
subcol = 0;
}
// Calculate the cofactor
cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1),0.0);
cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign,matrix[col]), det_4x4(&submatrix[0][0]));
// Accumulate the cofactors to calculate the determinant
det = cuCadd(det, cofactor);
}
return det;
}
__device__ cuDoubleComplex gpu_device_kernel_original(int i) {
// some random data, depending on `i`, but for simplification say constant data
cuDoubleComplex matrix[5*5] = {
make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
};
cuDoubleComplex det = det_5x5(matrix);
return det;
}
__device__ cuDoubleComplex gpu_device_kernel_direct(int i) {
// some random data, depending on `i`, but for simplification say constant data
cuDoubleComplex matrix[5*5] = {
make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
};
cuDoubleComplex det = direct_det5(matrix);
return det;
}
__device__ cuDoubleComplex gpu_device_kernel_generated(int i) {
// some random data, depending on `i`, but for simplification say constant data
cuDoubleComplex matrix[5*5] = {
make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
};
cuDoubleComplex det = generated_det_5x5(matrix);
return det;
}
__global__ void gpu_global_kernel_original(cuDoubleComplex *d_res, int numGpuThreads) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < numGpuThreads){
// some other computations, irrelevant to current problem
uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands
cuDoubleComplex det_sum, det;
det_sum = make_cuDoubleComplex(0.0,0.0);
for (int i = 0; i < total; i++) {
det = gpu_device_kernel_original(i);
det_sum = cuCadd(det_sum, det);
}
det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
d_res[tid] = det_sum;
}
}
__global__ void gpu_global_kernel_direct(cuDoubleComplex *d_res, int numGpuThreads) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < numGpuThreads){
// some other computations, irrelevant to current problem
uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands
cuDoubleComplex det_sum, det;
det_sum = make_cuDoubleComplex(0.0,0.0);
for (int i = 0; i < total; i++) {
det = gpu_device_kernel_direct(i);
det_sum = cuCadd(det_sum, det);
}
det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
d_res[tid] = det_sum;
}
}
__global__ void gpu_global_kernel_generated(cuDoubleComplex *d_res, int numGpuThreads) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < numGpuThreads){
// some other computations, irrelevant to current problem
uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands
cuDoubleComplex det_sum, det;
det_sum = make_cuDoubleComplex(0.0,0.0);
for (int i = 0; i < total; i++) {
det = gpu_device_kernel_generated(i);
det_sum = cuCadd(det_sum, det);
}
det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
d_res[tid] = det_sum;
}
}
int main(int argc, char *argv[]) {
if (argc < 2) {
printf("Usage: %s <method> [<numGpuThreads>]\n", argv[0]);
printf("method: 0 = original, 1 = direct, 2 = generated(no explicit cucomplex)\n");
return 1;
}
int method;
method = atoi(argv[1]);
int numGpuThreads = 1;
if(argc > 2){
numGpuThreads = atoi(argv[2]);
}
cuDoubleComplex *res;
res = (cuDoubleComplex*)malloc(sizeof(cuDoubleComplex) * numGpuThreads);
memset(res, 0.0,sizeof(cuDoubleComplex) * numGpuThreads);
cuDoubleComplex *d_res;
cudaMalloc(&d_res, sizeof(cuDoubleComplex) * numGpuThreads );
clock_t direct_start, direct_end;
direct_start = clock();
dim3 block = 128;
dim3 grid = (numGpuThreads + block.x -1) / block.x;
switch(method){
case 0: printf("original\n"); gpu_global_kernel_original<<<grid,block>>>(d_res, numGpuThreads); break;
case 1: printf("direct\n"); gpu_global_kernel_direct<<<grid,block>>>(d_res, numGpuThreads); break;
case 2: printf("generated\n"); gpu_global_kernel_generated<<<grid,block>>>(d_res, numGpuThreads); break;
}
cudaDeviceSynchronize();
direct_end = clock();
//just copy the first result, not all numGpuThreads results
cudaMemcpy(res, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );
printf("RESULT: %.16E + %.16E i\n", cuCreal(res[0]), cuCimag(res[0]));
double timing = (double)((double)(direct_end - direct_start) / CLOCKS_PER_SEC);
printf("time %.5f s\n", timing);
cudaDeviceReset();
return 0;
}
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论