在 GPU 线程内计算小型复数矩阵的行列式。

huangapple go评论55阅读模式
英文:

Determinant of a small complex matrix within a GPU thread

问题

问题是计算复杂矩阵(5x5、4x4和3x3)的行列式,要求在特定线程内的设备函数中完成。效率至关重要,因为原问题涉及在每个线程的循环内计算数千个这样的矩阵。

这里是使用直接展开(参数为0运行)和余子式展开(参数为1运行)方法的工作示例:

#include <stdio.h>
#include <complex.h>
#include <cstring>

#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N 5

// ...(略过一部分代码)

__host__ __device__ static __inline__ cuDoubleComplex cuCfms(cuDoubleComplex x,
                                                             cuDoubleComplex y,
                                                             cuDoubleComplex d){
    // ...(略过一部分代码)
}

__host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
                                                              cuDoubleComplex y,
                                                              cuDoubleComplex u,
                                                              cuDoubleComplex v) {
    // ...(略过一部分代码)
}

// ...(略过一部分代码)

__device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
    // ...(略过一部分代码)
}

__device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
                                   cuDoubleComplex c, cuDoubleComplex d,
                                   cuDoubleComplex e) {
    // ...(略过一部分代码)
}

__device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
                                   cuDoubleComplex c, cuDoubleComplex d,
                                   cuDoubleComplex e) {
    // ...(略过一部分代码)
}

// ...(略过一部分代码)

__device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
    // ...(略过一部分代码)
}

// ...(略过一部分代码)

__device__ cuDoubleComplex gpu_device_kernel(int i, int method) {
    // ...(略过一部分代码)
}

// ...(略过一部分代码)

__global__ void gpu_global_kernel(cuDoubleComplex *d_res, int method) {
    // ...(略过一部分代码)
}

int main(int argc, char *argv[]) {
    // ...(略过一部分代码)
}

然而,我在考虑这些实现是否真的高效。 我正在寻找一种替代方法,可以在设备函数中计算一般矩阵的行列式。 我提供的示例是最简单直接的方法之一,但我正在寻找另一种方法,比如LU/Cholesky分解,可以从设备函数中调用它。 不幸的是,诸如LAPACK之类的包对于CUDA不可用,但也许来自其他一些软件包的类似兼容函数可以在CUDA设备环境中使用。 对于可以在CUDA设备上使用的类似函数的任何建议或示例将不胜感激!

英文:

The problem is to compute the determinant of a complex matrix (5x5, 4x4, and 3x3) from a device function within a particular thread. Efficiency is crucial, as the original problem involves computing thousands of such matrices in a for loop within each thread.

Here is a working example using direct expansion (run with arg: 0) and cofactor expansion (run with arg: 1) methods:

#include <stdio.h>
#include <complex.h>
#include <cstring>
#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N 5
__host__ __device__ static __inline__  cuDoubleComplex cuCfms( cuDoubleComplex x, 
cuDoubleComplex y, 
cuDoubleComplex d){
double real_res;
double imag_res;
real_res = (cuCreal(x) *  cuCreal(y)) - cuCreal(d);
imag_res = (cuCreal(x) *  cuCimag(y)) - cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y))  + real_res;  
imag_res =  (cuCimag(x) *  cuCreal(y)) + imag_res;     
return make_cuDoubleComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v) {
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y) + cuCreal(u) + cuCreal(v), 
cuCimag(x) + cuCimag(y) + cuCimag(u) + cuCimag(v));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCasas(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v) {
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v), 
cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCasasa(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v,
cuDoubleComplex w) {
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v) + cuCreal(w), 
cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v) + cuCimag(w) );
}
__device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// B[12]*(B[18]*B[24] - B[19]*B[23])
//a*b*c -d*e
return cuCmul(a, cuCfms(b,c, cuCmul(d,e)));
}
__device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// B[12]*(B[18]*B[24] - B[19]*B[23])
//a*b*c -d*e
return cuCmul(cuCmul(make_cuDoubleComplex(-1.0,0.0),a), cuCfms(b,c, cuCmul(d,e)));
}
__device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
cuDoubleComplex det;
cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det5 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det11 = cuCmul(B[0],cuCmul(B[6],cuCadd( cuC_5pos(B[12],B[18],B[24],B[19],B[23]), 
cuCadd(  cuC_5neg(B[13],B[17],B[24],B[22],B[19]), 
cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det12 = cuCmul(B[0],cuCmul(B[11],cuCadd(cuC_5pos(B[7],B[18],B[24],B[19],B[23]), 
cuCadd( cuC_5neg(B[8],B[17],B[24],B[22],B[19]), 
cuC_5pos(B[9],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det13 = cuCmul(B[0],cuCmul(B[16],cuCadd(cuC_5pos(B[7],B[13],B[24],B[23],B[14]), 
cuCadd( cuC_5neg(B[8],B[12],B[24],B[22],B[14]), 
cuC_5pos(B[9],B[12],B[23],B[22],B[13]) ) ) ) );
cuDoubleComplex det14 = cuCmul(B[0],cuCmul(B[21],cuCadd(cuC_5pos(B[7],B[13],B[19],B[18],B[14]), 
cuCadd( cuC_5neg(B[8],B[12],B[19],B[17],B[14]), 
cuC_5pos(B[9],B[12],B[18],B[13],B[17]) ) ) ) );
det1 = cuCasas(det11, det12, det13, det14);
cuDoubleComplex det21 = cuCmul(B[1*5+0],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[12],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]), cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det22 = cuCmul(B[1*5+0],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[2],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[3],B[17],B[24],B[22],B[19]), cuC_5pos(B[4],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det23 = cuCmul(B[1*5+0],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[2],B[13],B[24],B[23],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[24],B[22],B[14]), cuC_5pos(B[4],B[12],B[23],B[22],B[13]) ) ) ) );
cuDoubleComplex det24 = cuCmul(B[1*5+0],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[2],B[13],B[19],B[18],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[19],B[17],B[14]), cuC_5pos(B[4],B[12],B[18],B[13],B[17]) ) ) ) );
det2 = cuCasas(det21, det22, det23, det24);
cuDoubleComplex det31 = cuCmul(B[10],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]), 
cuCadd(  cuC_5neg(B[1*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]), 
cuC_5pos(B[1*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
cuDoubleComplex det32 = cuCmul(B[10],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]), 
cuCadd( cuC_5neg(B[0*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]), 
cuC_5pos(B[0*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
cuDoubleComplex det33 = cuCmul(B[10],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]), 
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]), 
cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det34 = cuCmul(B[10],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]), 
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]), 
cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[1*5+3],B[3*5+2]) ) ) ) );
det3 = cuCasas(det31, det32, det33, det34);
cuDoubleComplex det41 = cuCmul(B[15],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]), 
cuCadd(  cuC_5neg(B[1*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]), 
cuC_5pos(B[1*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det42 = cuCmul(B[15],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]), 
cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]), 
cuC_5pos(B[0*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det43 = cuCmul(B[15],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]), 
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]), 
cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det44 = cuCmul(B[15],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]), 
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]), 
cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
det4 = cuCasas(det41, det42, det43, det44);
cuDoubleComplex det51 = cuCmul(B[20],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]), 
cuCadd(  cuC_5neg(B[1*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]), 
cuC_5pos(B[1*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det52 = cuCmul(B[20],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]), 
cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]), 
cuC_5pos(B[0*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det53 = cuCmul(B[20],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]), 
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]), 
cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[3*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det54 = cuCmul(B[20],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]), 
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]), 
cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
det5 = cuCasas(det51, det52, det53, det54);
det = cuCasasa(det1,det2,det3,det4,det5);
return det;
}
#define N 5
__device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
// Calculate the determinant using cofactor expansions
det1 = cuCmul(matrix[0], 
cuCadd( cuCsub( cuCmul( matrix[5], 
cuCsub( cuCmul(matrix[10],matrix[15]), 
cuCmul(matrix[11],matrix[14]) ))
,cuCmul(matrix[6], 
cuCsub( cuCmul(matrix[9],matrix[15]),
cuCmul(matrix[11],matrix[13]) ) ) )
,cuCmul(matrix[7], 
cuCsub( cuCmul(matrix[9],matrix[14]),
cuCmul(matrix[10],matrix[13]) ) ) ) );
det2 = cuCmul(matrix[1], 
cuCadd( cuCsub( cuCmul( matrix[4], 
cuCsub( cuCmul(matrix[10],matrix[15]), 
cuCmul(matrix[11],matrix[14]) ))
,cuCmul(matrix[6], 
cuCsub( cuCmul(matrix[8],matrix[15]),
cuCmul(matrix[11],matrix[12]) ) ) )
,cuCmul(matrix[7], 
cuCsub( cuCmul(matrix[8],matrix[14]),
cuCmul(matrix[10],matrix[12]) ) ) ) );
det3 = cuCmul(matrix[2], 
cuCadd( cuCsub( cuCmul( matrix[4], 
cuCsub( cuCmul(matrix[9],matrix[15]), 
cuCmul(matrix[11],matrix[13]) ))
,cuCmul(matrix[5], 
cuCsub( cuCmul(matrix[8],matrix[15]),
cuCmul(matrix[11],matrix[12]) ) ) )
,cuCmul(matrix[7], 
cuCsub( cuCmul(matrix[8],matrix[13]),
cuCmul(matrix[9],matrix[12]) ) ) ) );
det4 = cuCmul(matrix[3], 
cuCadd( cuCsub( cuCmul( matrix[4], 
cuCsub( cuCmul(matrix[9],matrix[14]), 
cuCmul(matrix[10],matrix[13]) ))
,cuCmul(matrix[5], 
cuCsub( cuCmul(matrix[8],matrix[14]),
cuCmul(matrix[10],matrix[12]) ) ) )
,cuCmul(matrix[6], 
cuCsub( cuCmul(matrix[8],matrix[13]),
cuCmul(matrix[9],matrix[12]) ) ) ) );
det = cuCsub( cuCadd(det1,det3), cuCadd(det2,det4) );
return det;
}
__device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
// Calculate the determinant using cofactor expansions
for (int col = 0; col < N; col++) {
cuDoubleComplex submatrix[N-1][N-1];
int subrow = 0, subcol = 0;
// Create the submatrix by excluding the current row and column
for (int row = 1; row < N; row++) {
for (int c = 0; c < N; c++) {
if (c == col) {
continue;
}
submatrix[subrow][subcol] = matrix[row * N + c];
subcol++;
}
subrow++;
subcol = 0;
}
// Calculate the cofactor
cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1),0.0);
cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign,matrix[col]), det_4x4(&submatrix[0][0]));
// Accumulate the cofactors to calculate the determinant
det = cuCadd(det, cofactor);
}
return det;
}
__device__ cuDoubleComplex gpu_device_kernel(int i, int method) {
// some random data, depending on `i`, but for simplification say constant data
cuDoubleComplex matrix[5*5] = {
make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
};
cuDoubleComplex cf_sign[5]={make_cuDoubleComplex(1.0,0.0),
make_cuDoubleComplex(-1.0,0.0),
make_cuDoubleComplex(1.0,0.0),
make_cuDoubleComplex(-1.0,0.0),
make_cuDoubleComplex(1.0,0.0),
};
cuDoubleComplex det;
if (method == 0) {
det = direct_det5(matrix);    
} else {
det = det_5x5(matrix);
}
return det;
}
__global__ void gpu_global_kernel(cuDoubleComplex *d_res, int method) {
// some other computations, irrelevant to current problem
uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands 
cuDoubleComplex det_sum, det;
det_sum = make_cuDoubleComplex(0.0,0.0);
for (int i = 0; i < total; i++) {
det = gpu_device_kernel(i, method);
det_sum = cuCadd(det_sum, det);
}
det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
d_res[0] = det_sum;
return;
}
int main(int argc, char *argv[]) {
if (argc != 2) {
printf("Usage: %s <num1> <num2>\n", argv[0]);
return 1;
}
int method;
method = atoi(argv[1]);
double complex *res;
res = (double complex*)malloc(sizeof(double complex));
memset(res, 0.0,sizeof(double complex));
cuDoubleComplex *d_res;
cudaMalloc(&d_res, sizeof(cuDoubleComplex) );
int *d_method;
cudaMalloc(&d_method, sizeof(int) );
cudaMemcpy(d_method, &method, sizeof(int), cudaMemcpyHostToDevice);
clock_t direct_start, direct_end;
direct_start = clock();
gpu_global_kernel <<<1,1>>>(d_res, *d_method);
cudaDeviceSynchronize();
direct_end = clock();
cudaMemcpy(res, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );
printf("RESULT: %.16E + %.16E i\n", creal(res[0]), cimag(res[0]));
double direct_time = (double)((double)(direct_end - direct_start) / CLOCKS_PER_SEC);
if (method == 0) {
printf("DIRECT EXEC: %.5f s \n", direct_time);
} else {
printf("COFACTOR EXEC: %.5f s \n", direct_time);        
}    
cudaDeviceReset();
return 0;
}

However, I am wondering if these implementation is even efficient.

I am looking for an alternative method to compute the determinant of a general matrix inside a device function. The example I have provided is one of the most straightforward methods, but an alternative approach, such as LU/Cholesky decomposition, that can be called from the device function is what I am seeking. Unfortunately, packages like LAPACK are not available for CUDA, but maybe a similar compatible function from some other packages could work within the CUDA device environment.

Any suggestions or function examples would be greatly appreciated!

答案1

得分: 1

由于您包含了一个C++头文件(<cstring>),我假设这是一段C++代码,尽管它看起来非常像C语言。

我直接采用您提供的用于扩展的生成代码,通过简单地定义cuDoubleComplex*+-运算符来实现它。我在下面的代码中将此方法称为“generated”。此外,我对您的代码进行了一些轻微的修改。我添加了一个程序参数来指定GPU线程的数量,以确保GPU的充分利用。我还为每种方法使用了单独的内核。方法0是原始方法,方法1是您的直接方法,方法2是我的“generated”方法。

我使用以下命令编译了该代码:nvcc -std=c++14 -O3 -arch=sm_70 main.cu -o main,并在V100 Pcie上针对32000个GPU线程看到以下时间。

配置 时间
原始 145.01487秒
直接 67.07063秒
生成 24.02771秒

以下是代码:

#include <stdio.h>
#include <complex.h>
#include <cstring>

#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N 5

// ... 其余代码 ...

请注意,我仅返回代码的部分翻译。如果您需要更多信息或有其他问题,请随时提出。

英文:

Since you are including a C++ header (<cstring>) I am assuming its a C++ code, even though it looks very C-like.

I took the general generated code that you provided for expansion and implemented it directly as is by simply defining the operators *,+,- for the cuDoubleComplex .

I call this method "generated" in the code below. Additionally, I made some slight modifications to your code. I added a program parameter to specify the number of GPU threads to have a proper GPU utilization. I also use separate kernels for each method. Method 0 is the original method, method 1 is your direct method, method 2 is my "generated" method.

I compiled the code with nvcc -std=c++14 -O3 -arch=sm_70 main.cu -o main and see the following timings on V100 Pcie for 32000 GPU threads.

Configuration Timing
original 145.01487 s
direct 67.07063 s
generated 24.02771 s

The code:

#include <stdio.h>
#include <complex.h>
#include <cstring>

#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N 5


__host__ __device__
bool operator==(cuDoubleComplex l, cuDoubleComplex r){
    return cuCreal(l) == cuCreal(r) && cuCimag(l) == cuCimag(r);
}

__host__ __device__
cuDoubleComplex operator*(cuDoubleComplex l, cuDoubleComplex r){
    return cuCmul(l,r);
}

__host__ __device__
cuDoubleComplex operator+(cuDoubleComplex l, cuDoubleComplex r){
    return cuCadd(l,r);
}

__host__ __device__
cuDoubleComplex operator-(cuDoubleComplex l, cuDoubleComplex r){
    return cuCsub(l,r);
}

__host__ __device__
cuDoubleComplex operator-(cuDoubleComplex l){
    return make_cuDoubleComplex(-cuCreal(l), -cuCimag(l));
}


template<class Matrix>
__device__ 
cuDoubleComplex generated_det_5x5(Matrix B){
    return 
       B[0*5+0]*(B[1*5+1]*(B[2*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[2*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[2*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[2*5+1]*(B[1*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[1*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[1*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[1*5+2]*(B[2*5+3]*B[4*5+4]-B[4*5+3]*B[2*5+4])-B[1*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[4*5+1]*(B[1*5+2]*(B[2*5+3]*B[3*5+4]-B[3*5+3]*B[2*5+4])-B[1*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[3*5+3]-B[2*5+3]*B[3*5+2])))
     - B[1*5+0]*(B[0*5+1]*(B[2*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[2*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[2*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[2*5+1]*(B[0*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[0*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[0*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[0*5+2]*(B[2*5+3]*B[4*5+4]-B[4*5+3]*B[2*5+4])-B[0*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[4*5+1]*(B[0*5+2]*(B[2*5+3]*B[3*5+4]-B[3*5+3]*B[2*5+4])-B[0*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[3*5+3]-B[2*5+3]*B[3*5+2])))
     + B[2*5+0]*(B[0*5+1]*(B[1*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[1*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[1*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[1*5+1]*(B[0*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[0*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[0*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[0*5+2]*(B[1*5+3]*B[4*5+4]-B[4*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[4*5+4]-B[4*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[4*5+3]-B[4*5+2]*B[1*5+3]))-B[4*5+1]*(B[0*5+2]*(B[1*5+3]*B[3*5+4]-B[3*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[3*5+4]-B[3*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[3*5+3]-B[1*5+3]*B[3*5+2])))
     - B[3*5+0]*(B[0*5+1]*(B[1*5+2]*(B[2*5+3]*B[4*5+4]-B[2*5+4]*B[4*5+3])-B[1*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[1*5+1]*(B[0*5+2]*(B[2*5+3]*B[4*5+4]-B[2*5+4]*B[4*5+3])-B[0*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))+B[2*5+1]*(B[0*5+2]*(B[1*5+3]*B[4*5+4]-B[4*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[4*5+4]-B[4*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[4*5+3]-B[4*5+2]*B[1*5+3]))-B[4*5+1]*(B[0*5+2]*(B[1*5+3]*B[2*5+4]-B[2*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[2*5+4]-B[2*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[2*5+3]-B[1*5+3]*B[2*5+2])))
     + B[4*5+0]*(B[0*5+1]*(B[1*5+2]*(B[2*5+3]*B[3*5+4]-B[2*5+4]*B[3*5+3])-B[1*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[3*5+3]-B[3*5+2]*B[2*5+3]))-B[1*5+1]*(B[0*5+2]*(B[2*5+3]*B[3*5+4]-B[2*5+4]*B[3*5+3])-B[0*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[3*5+3]-B[3*5+2]*B[2*5+3]))+B[2*5+1]*(B[0*5+2]*(B[1*5+3]*B[3*5+4]-B[3*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[3*5+4]-B[3*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[3*5+3]-B[3*5+2]*B[1*5+3]))-B[3*5+1]*(B[0*5+2]*(B[1*5+3]*B[2*5+4]-B[2*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[2*5+4]-B[2*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[2*5+3]-B[1*5+3]*B[2*5+2])));
}








__host__ __device__ static __inline__  cuDoubleComplex cuCfms( cuDoubleComplex x, 
                                                               cuDoubleComplex y, 
                                                               cuDoubleComplex d){
    double real_res;
    double imag_res;
    
    real_res = (cuCreal(x) *  cuCreal(y)) - cuCreal(d);
    imag_res = (cuCreal(x) *  cuCimag(y)) - cuCimag(d);
            
    real_res = -(cuCimag(x) * cuCimag(y))  + real_res;  
    imag_res =  (cuCimag(x) *  cuCreal(y)) + imag_res;     
     
    return make_cuDoubleComplex(real_res, imag_res);
}

__host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
                                                              cuDoubleComplex y,
                                                              cuDoubleComplex u,
                                                              cuDoubleComplex v) {

    return make_cuDoubleComplex (cuCreal(x) + cuCreal(y) + cuCreal(u) + cuCreal(v), 
                                 cuCimag(x) + cuCimag(y) + cuCimag(u) + cuCimag(v));
}

__host__ __device__ static __inline__ cuDoubleComplex cuCasas(cuDoubleComplex x,
                                                              cuDoubleComplex y,
                                                              cuDoubleComplex u,
                                                              cuDoubleComplex v) {

    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v), 
                                 cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v));
}

__host__ __device__ static __inline__ cuDoubleComplex cuCasasa(cuDoubleComplex x,
                                                               cuDoubleComplex y,
                                                               cuDoubleComplex u,
                                                               cuDoubleComplex v,
                                                               cuDoubleComplex w) {

    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v) + cuCreal(w), 
                                 cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v) + cuCimag(w) );
}


__device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
                                   cuDoubleComplex c, cuDoubleComplex d,
                                   cuDoubleComplex e) {
     // B[12]*(B[18]*B[24] - B[19]*B[23])
    //a*b*c -d*e
    return cuCmul(a, cuCfms(b,c, cuCmul(d,e)));
}

__device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
                                   cuDoubleComplex c, cuDoubleComplex d,
                                   cuDoubleComplex e) {
     // B[12]*(B[18]*B[24] - B[19]*B[23])
    //a*b*c -d*e
    return cuCmul(cuCmul(make_cuDoubleComplex(-1.0,0.0),a), cuCfms(b,c, cuCmul(d,e)));
}

__device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
    cuDoubleComplex det;

    cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det5 = make_cuDoubleComplex(0.0,0.0);


    cuDoubleComplex det11 = cuCmul(B[0],cuCmul(B[6],cuCadd( cuC_5pos(B[12],B[18],B[24],B[19],B[23]), 
                                                   cuCadd(  cuC_5neg(B[13],B[17],B[24],B[22],B[19]), 
                                                            cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det12 = cuCmul(B[0],cuCmul(B[11],cuCadd(cuC_5pos(B[7],B[18],B[24],B[19],B[23]), 
                                                    cuCadd( cuC_5neg(B[8],B[17],B[24],B[22],B[19]), 
                                                            cuC_5pos(B[9],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det13 = cuCmul(B[0],cuCmul(B[16],cuCadd(cuC_5pos(B[7],B[13],B[24],B[23],B[14]), 
                                                    cuCadd( cuC_5neg(B[8],B[12],B[24],B[22],B[14]), 
                                                            cuC_5pos(B[9],B[12],B[23],B[22],B[13]) ) ) ) );
    cuDoubleComplex det14 = cuCmul(B[0],cuCmul(B[21],cuCadd(cuC_5pos(B[7],B[13],B[19],B[18],B[14]), 
                                                    cuCadd( cuC_5neg(B[8],B[12],B[19],B[17],B[14]), 
                                                            cuC_5pos(B[9],B[12],B[18],B[13],B[17]) ) ) ) );

    det1 = cuCasas(det11, det12, det13, det14);


    cuDoubleComplex det21 = cuCmul(B[1*5+0],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[12],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]), cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det22 = cuCmul(B[1*5+0],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[2],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[3],B[17],B[24],B[22],B[19]), cuC_5pos(B[4],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det23 = cuCmul(B[1*5+0],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[2],B[13],B[24],B[23],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[24],B[22],B[14]), cuC_5pos(B[4],B[12],B[23],B[22],B[13]) ) ) ) );
    cuDoubleComplex det24 = cuCmul(B[1*5+0],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[2],B[13],B[19],B[18],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[19],B[17],B[14]), cuC_5pos(B[4],B[12],B[18],B[13],B[17]) ) ) ) );

    det2 = cuCasas(det21, det22, det23, det24);


    cuDoubleComplex det31 = cuCmul(B[10],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]), 
                                                       cuCadd(  cuC_5neg(B[1*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]), 
                                                                cuC_5pos(B[1*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
    cuDoubleComplex det32 = cuCmul(B[10],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
    cuDoubleComplex det33 = cuCmul(B[10],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
    cuDoubleComplex det34 = cuCmul(B[10],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[1*5+3],B[3*5+2]) ) ) ) );


    det3 = cuCasas(det31, det32, det33, det34);


    cuDoubleComplex det41 = cuCmul(B[15],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]), 
                                                       cuCadd(  cuC_5neg(B[1*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[1*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det42 = cuCmul(B[15],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det43 = cuCmul(B[15],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
    cuDoubleComplex det44 = cuCmul(B[15],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );


    det4 = cuCasas(det41, det42, det43, det44);


    cuDoubleComplex det51 = cuCmul(B[20],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]), 
                                                       cuCadd(  cuC_5neg(B[1*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[1*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det52 = cuCmul(B[20],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det53 = cuCmul(B[20],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[3*5+2],B[1*5+3]) ) ) ) );
    cuDoubleComplex det54 = cuCmul(B[20],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );


    det5 = cuCasas(det51, det52, det53, det54);

    det = cuCasasa(det1,det2,det3,det4,det5);

    return det;
}



__device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
    cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);

    // Calculate the determinant using cofactor expansions
    det1 = cuCmul(matrix[0], 
                  cuCadd( cuCsub( cuCmul( matrix[5], 
                                          cuCsub( cuCmul(matrix[10],matrix[15]), 
                                                  cuCmul(matrix[11],matrix[14]) ))
                                 ,cuCmul(matrix[6], 
                                         cuCsub( cuCmul(matrix[9],matrix[15]),
                                                 cuCmul(matrix[11],matrix[13]) ) ) )
                         ,cuCmul(matrix[7], 
                                 cuCsub( cuCmul(matrix[9],matrix[14]),
                                         cuCmul(matrix[10],matrix[13]) ) ) ) );



    det2 = cuCmul(matrix[1], 
                  cuCadd( cuCsub( cuCmul( matrix[4], 
                                          cuCsub( cuCmul(matrix[10],matrix[15]), 
                                                  cuCmul(matrix[11],matrix[14]) ))
                                 ,cuCmul(matrix[6], 
                                         cuCsub( cuCmul(matrix[8],matrix[15]),
                                                 cuCmul(matrix[11],matrix[12]) ) ) )
                         ,cuCmul(matrix[7], 
                                 cuCsub( cuCmul(matrix[8],matrix[14]),
                                         cuCmul(matrix[10],matrix[12]) ) ) ) );


    det3 = cuCmul(matrix[2], 
                  cuCadd( cuCsub( cuCmul( matrix[4], 
                                          cuCsub( cuCmul(matrix[9],matrix[15]), 
                                                  cuCmul(matrix[11],matrix[13]) ))
                                 ,cuCmul(matrix[5], 
                                         cuCsub( cuCmul(matrix[8],matrix[15]),
                                                 cuCmul(matrix[11],matrix[12]) ) ) )
                         ,cuCmul(matrix[7], 
                                 cuCsub( cuCmul(matrix[8],matrix[13]),
                                         cuCmul(matrix[9],matrix[12]) ) ) ) );
    
    
    det4 = cuCmul(matrix[3], 
                  cuCadd( cuCsub( cuCmul( matrix[4], 
                                          cuCsub( cuCmul(matrix[9],matrix[14]), 
                                                  cuCmul(matrix[10],matrix[13]) ))
                                 ,cuCmul(matrix[5], 
                                         cuCsub( cuCmul(matrix[8],matrix[14]),
                                                 cuCmul(matrix[10],matrix[12]) ) ) )
                         ,cuCmul(matrix[6], 
                                 cuCsub( cuCmul(matrix[8],matrix[13]),
                                         cuCmul(matrix[9],matrix[12]) ) ) ) );
    
    det = cuCsub( cuCadd(det1,det3), cuCadd(det2,det4) );

    return det;
}


__device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
    cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);


    // Calculate the determinant using cofactor expansions
    #pragma unroll
    for (int col = 0; col < N; col++) {
        cuDoubleComplex submatrix[N-1][N-1];
        int subrow = 0, subcol = 0;

        // Create the submatrix by excluding the current row and column
        #pragma unroll
        for (int row = 1; row < N; row++) {
            #pragma unroll
            for (int c = 0; c < N; c++) {
                if (c == col) {
                    continue;
                }
                submatrix[subrow][subcol] = matrix[row * N + c];
                subcol++;
            }
            subrow++;
            subcol = 0;
        }

        // Calculate the cofactor
        cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1),0.0);
        cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign,matrix[col]), det_4x4(&submatrix[0][0]));


        // Accumulate the cofactors to calculate the determinant
        det = cuCadd(det, cofactor);
    }


    return det;
}

__device__ cuDoubleComplex gpu_device_kernel_original(int i) {


    // some random data, depending on `i`, but for simplification say constant data
    cuDoubleComplex matrix[5*5] = {
            make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
            make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
            make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
            make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
            make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
    };

    cuDoubleComplex det = det_5x5(matrix);

    return det;
}

__device__ cuDoubleComplex gpu_device_kernel_direct(int i) {


    // some random data, depending on `i`, but for simplification say constant data
    cuDoubleComplex matrix[5*5] = {
            make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
            make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
            make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
            make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
            make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
    };

    cuDoubleComplex det = direct_det5(matrix);    

    return det;
}

__device__ cuDoubleComplex gpu_device_kernel_generated(int i) {


    // some random data, depending on `i`, but for simplification say constant data
    cuDoubleComplex matrix[5*5] = {
            make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
            make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
            make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
            make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
            make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
    };

    cuDoubleComplex det = generated_det_5x5(matrix);    

    return det;
}


__global__ void gpu_global_kernel_original(cuDoubleComplex *d_res, int numGpuThreads) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid < numGpuThreads){

        // some other computations, irrelevant to current problem

        uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands 
        
        cuDoubleComplex det_sum, det;
        det_sum = make_cuDoubleComplex(0.0,0.0);
        for (int i = 0; i < total; i++) {
            det = gpu_device_kernel_original(i);
            det_sum = cuCadd(det_sum, det);
        }

        det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );

        d_res[tid] = det_sum;
    }
}


__global__ void gpu_global_kernel_direct(cuDoubleComplex *d_res, int numGpuThreads) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid < numGpuThreads){

        // some other computations, irrelevant to current problem

        uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands 
        
        cuDoubleComplex det_sum, det;
        det_sum = make_cuDoubleComplex(0.0,0.0);
        for (int i = 0; i < total; i++) {
            det = gpu_device_kernel_direct(i);
            det_sum = cuCadd(det_sum, det);
        }

        det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );

        d_res[tid] = det_sum;
    }
}


__global__ void gpu_global_kernel_generated(cuDoubleComplex *d_res, int numGpuThreads) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid < numGpuThreads){

        // some other computations, irrelevant to current problem

        uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands 
        
        cuDoubleComplex det_sum, det;
        det_sum = make_cuDoubleComplex(0.0,0.0);
        for (int i = 0; i < total; i++) {
            det = gpu_device_kernel_generated(i);
            det_sum = cuCadd(det_sum, det);
        }

        det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );

        d_res[tid] = det_sum;
    }
}

int main(int argc, char *argv[]) {

    if (argc < 2) {
        printf("Usage: %s <method> [<numGpuThreads>]\n", argv[0]);
        printf("method: 0 = original, 1 = direct, 2 = generated(no explicit cucomplex)\n");
        return 1;
    }

    int method;
    method = atoi(argv[1]);

    int numGpuThreads = 1;
    if(argc > 2){
        numGpuThreads = atoi(argv[2]);
    }
    
    cuDoubleComplex *res;
    res = (cuDoubleComplex*)malloc(sizeof(cuDoubleComplex) * numGpuThreads);
    memset(res, 0.0,sizeof(cuDoubleComplex) * numGpuThreads);


    cuDoubleComplex *d_res;
    cudaMalloc(&d_res, sizeof(cuDoubleComplex) * numGpuThreads );

    clock_t direct_start, direct_end;
    direct_start = clock();
    dim3 block = 128;
    dim3 grid = (numGpuThreads + block.x -1) / block.x;
    switch(method){
        case 0: printf("original\n"); gpu_global_kernel_original<<<grid,block>>>(d_res, numGpuThreads); break;
        case 1: printf("direct\n"); gpu_global_kernel_direct<<<grid,block>>>(d_res, numGpuThreads); break;
        case 2: printf("generated\n"); gpu_global_kernel_generated<<<grid,block>>>(d_res, numGpuThreads); break;
    }

    cudaDeviceSynchronize();

    direct_end = clock();

    //just copy the first result, not all numGpuThreads results
    cudaMemcpy(res, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );

    printf("RESULT: %.16E + %.16E i\n", cuCreal(res[0]), cuCimag(res[0]));

    double timing = (double)((double)(direct_end - direct_start) / CLOCKS_PER_SEC);

    printf("time %.5f s\n", timing);

    cudaDeviceReset();

    return 0;
}

huangapple
  • 本文由 发表于 2023年5月28日 22:03:36
  • 转载请务必保留本文链接:https://go.coder-hub.com/76351879.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定