在 GPU 线程内计算小型复数矩阵的行列式。

huangapple go评论93阅读模式
英文:

Determinant of a small complex matrix within a GPU thread

问题

问题是计算复杂矩阵(5x5、4x4和3x3)的行列式,要求在特定线程内的设备函数中完成。效率至关重要,因为原问题涉及在每个线程的循环内计算数千个这样的矩阵。

这里是使用直接展开(参数为0运行)和余子式展开(参数为1运行)方法的工作示例:

  1. #include <stdio.h>
  2. #include <complex.h>
  3. #include <cstring>
  4. #include <cuda.h>
  5. #include <cuComplex.h>
  6. #include <cuda_runtime.h>
  7. #include <device_launch_parameters.h>
  8. #define N 5
  9. // ...(略过一部分代码)
  10. __host__ __device__ static __inline__ cuDoubleComplex cuCfms(cuDoubleComplex x,
  11. cuDoubleComplex y,
  12. cuDoubleComplex d){
  13. // ...(略过一部分代码)
  14. }
  15. __host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
  16. cuDoubleComplex y,
  17. cuDoubleComplex u,
  18. cuDoubleComplex v) {
  19. // ...(略过一部分代码)
  20. }
  21. // ...(略过一部分代码)
  22. __device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
  23. // ...(略过一部分代码)
  24. }
  25. __device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
  26. cuDoubleComplex c, cuDoubleComplex d,
  27. cuDoubleComplex e) {
  28. // ...(略过一部分代码)
  29. }
  30. __device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
  31. cuDoubleComplex c, cuDoubleComplex d,
  32. cuDoubleComplex e) {
  33. // ...(略过一部分代码)
  34. }
  35. // ...(略过一部分代码)
  36. __device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
  37. // ...(略过一部分代码)
  38. }
  39. // ...(略过一部分代码)
  40. __device__ cuDoubleComplex gpu_device_kernel(int i, int method) {
  41. // ...(略过一部分代码)
  42. }
  43. // ...(略过一部分代码)
  44. __global__ void gpu_global_kernel(cuDoubleComplex *d_res, int method) {
  45. // ...(略过一部分代码)
  46. }
  47. int main(int argc, char *argv[]) {
  48. // ...(略过一部分代码)
  49. }

然而,我在考虑这些实现是否真的高效。 我正在寻找一种替代方法,可以在设备函数中计算一般矩阵的行列式。 我提供的示例是最简单直接的方法之一,但我正在寻找另一种方法,比如LU/Cholesky分解,可以从设备函数中调用它。 不幸的是,诸如LAPACK之类的包对于CUDA不可用,但也许来自其他一些软件包的类似兼容函数可以在CUDA设备环境中使用。 对于可以在CUDA设备上使用的类似函数的任何建议或示例将不胜感激!

英文:

The problem is to compute the determinant of a complex matrix (5x5, 4x4, and 3x3) from a device function within a particular thread. Efficiency is crucial, as the original problem involves computing thousands of such matrices in a for loop within each thread.

Here is a working example using direct expansion (run with arg: 0) and cofactor expansion (run with arg: 1) methods:

  1. #include <stdio.h>
  2. #include <complex.h>
  3. #include <cstring>
  4. #include <cuda.h>
  5. #include <cuComplex.h>
  6. #include <cuda_runtime.h>
  7. #include <device_launch_parameters.h>
  8. #define N 5
  9. __host__ __device__ static __inline__ cuDoubleComplex cuCfms( cuDoubleComplex x,
  10. cuDoubleComplex y,
  11. cuDoubleComplex d){
  12. double real_res;
  13. double imag_res;
  14. real_res = (cuCreal(x) * cuCreal(y)) - cuCreal(d);
  15. imag_res = (cuCreal(x) * cuCimag(y)) - cuCimag(d);
  16. real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
  17. imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
  18. return make_cuDoubleComplex(real_res, imag_res);
  19. }
  20. __host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
  21. cuDoubleComplex y,
  22. cuDoubleComplex u,
  23. cuDoubleComplex v) {
  24. return make_cuDoubleComplex (cuCreal(x) + cuCreal(y) + cuCreal(u) + cuCreal(v),
  25. cuCimag(x) + cuCimag(y) + cuCimag(u) + cuCimag(v));
  26. }
  27. __host__ __device__ static __inline__ cuDoubleComplex cuCasas(cuDoubleComplex x,
  28. cuDoubleComplex y,
  29. cuDoubleComplex u,
  30. cuDoubleComplex v) {
  31. return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v),
  32. cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v));
  33. }
  34. __host__ __device__ static __inline__ cuDoubleComplex cuCasasa(cuDoubleComplex x,
  35. cuDoubleComplex y,
  36. cuDoubleComplex u,
  37. cuDoubleComplex v,
  38. cuDoubleComplex w) {
  39. return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v) + cuCreal(w),
  40. cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v) + cuCimag(w) );
  41. }
  42. __device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
  43. cuDoubleComplex c, cuDoubleComplex d,
  44. cuDoubleComplex e) {
  45. // B[12]*(B[18]*B[24] - B[19]*B[23])
  46. //a*b*c -d*e
  47. return cuCmul(a, cuCfms(b,c, cuCmul(d,e)));
  48. }
  49. __device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
  50. cuDoubleComplex c, cuDoubleComplex d,
  51. cuDoubleComplex e) {
  52. // B[12]*(B[18]*B[24] - B[19]*B[23])
  53. //a*b*c -d*e
  54. return cuCmul(cuCmul(make_cuDoubleComplex(-1.0,0.0),a), cuCfms(b,c, cuCmul(d,e)));
  55. }
  56. __device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
  57. cuDoubleComplex det;
  58. cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
  59. cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
  60. cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
  61. cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
  62. cuDoubleComplex det5 = make_cuDoubleComplex(0.0,0.0);
  63. cuDoubleComplex det11 = cuCmul(B[0],cuCmul(B[6],cuCadd( cuC_5pos(B[12],B[18],B[24],B[19],B[23]),
  64. cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]),
  65. cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
  66. cuDoubleComplex det12 = cuCmul(B[0],cuCmul(B[11],cuCadd(cuC_5pos(B[7],B[18],B[24],B[19],B[23]),
  67. cuCadd( cuC_5neg(B[8],B[17],B[24],B[22],B[19]),
  68. cuC_5pos(B[9],B[17],B[23],B[22],B[18]) ) ) ) );
  69. cuDoubleComplex det13 = cuCmul(B[0],cuCmul(B[16],cuCadd(cuC_5pos(B[7],B[13],B[24],B[23],B[14]),
  70. cuCadd( cuC_5neg(B[8],B[12],B[24],B[22],B[14]),
  71. cuC_5pos(B[9],B[12],B[23],B[22],B[13]) ) ) ) );
  72. cuDoubleComplex det14 = cuCmul(B[0],cuCmul(B[21],cuCadd(cuC_5pos(B[7],B[13],B[19],B[18],B[14]),
  73. cuCadd( cuC_5neg(B[8],B[12],B[19],B[17],B[14]),
  74. cuC_5pos(B[9],B[12],B[18],B[13],B[17]) ) ) ) );
  75. det1 = cuCasas(det11, det12, det13, det14);
  76. cuDoubleComplex det21 = cuCmul(B[1*5+0],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[12],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]), cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
  77. cuDoubleComplex det22 = cuCmul(B[1*5+0],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[2],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[3],B[17],B[24],B[22],B[19]), cuC_5pos(B[4],B[17],B[23],B[22],B[18]) ) ) ) );
  78. cuDoubleComplex det23 = cuCmul(B[1*5+0],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[2],B[13],B[24],B[23],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[24],B[22],B[14]), cuC_5pos(B[4],B[12],B[23],B[22],B[13]) ) ) ) );
  79. cuDoubleComplex det24 = cuCmul(B[1*5+0],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[2],B[13],B[19],B[18],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[19],B[17],B[14]), cuC_5pos(B[4],B[12],B[18],B[13],B[17]) ) ) ) );
  80. det2 = cuCasas(det21, det22, det23, det24);
  81. cuDoubleComplex det31 = cuCmul(B[10],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
  82. cuCadd( cuC_5neg(B[1*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
  83. cuC_5pos(B[1*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
  84. cuDoubleComplex det32 = cuCmul(B[10],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
  85. cuCadd( cuC_5neg(B[0*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
  86. cuC_5pos(B[0*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
  87. cuDoubleComplex det33 = cuCmul(B[10],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
  88. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
  89. cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
  90. cuDoubleComplex det34 = cuCmul(B[10],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
  91. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
  92. cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[1*5+3],B[3*5+2]) ) ) ) );
  93. det3 = cuCasas(det31, det32, det33, det34);
  94. cuDoubleComplex det41 = cuCmul(B[15],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
  95. cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
  96. cuC_5pos(B[1*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
  97. cuDoubleComplex det42 = cuCmul(B[15],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
  98. cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
  99. cuC_5pos(B[0*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
  100. cuDoubleComplex det43 = cuCmul(B[15],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
  101. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
  102. cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
  103. cuDoubleComplex det44 = cuCmul(B[15],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
  104. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
  105. cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
  106. det4 = cuCasas(det41, det42, det43, det44);
  107. cuDoubleComplex det51 = cuCmul(B[20],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
  108. cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
  109. cuC_5pos(B[1*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
  110. cuDoubleComplex det52 = cuCmul(B[20],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
  111. cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
  112. cuC_5pos(B[0*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
  113. cuDoubleComplex det53 = cuCmul(B[20],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
  114. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
  115. cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[3*5+2],B[1*5+3]) ) ) ) );
  116. cuDoubleComplex det54 = cuCmul(B[20],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
  117. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
  118. cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
  119. det5 = cuCasas(det51, det52, det53, det54);
  120. det = cuCasasa(det1,det2,det3,det4,det5);
  121. return det;
  122. }
  123. #define N 5
  124. __device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
  125. cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
  126. cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
  127. cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
  128. cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
  129. cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
  130. // Calculate the determinant using cofactor expansions
  131. det1 = cuCmul(matrix[0],
  132. cuCadd( cuCsub( cuCmul( matrix[5],
  133. cuCsub( cuCmul(matrix[10],matrix[15]),
  134. cuCmul(matrix[11],matrix[14]) ))
  135. ,cuCmul(matrix[6],
  136. cuCsub( cuCmul(matrix[9],matrix[15]),
  137. cuCmul(matrix[11],matrix[13]) ) ) )
  138. ,cuCmul(matrix[7],
  139. cuCsub( cuCmul(matrix[9],matrix[14]),
  140. cuCmul(matrix[10],matrix[13]) ) ) ) );
  141. det2 = cuCmul(matrix[1],
  142. cuCadd( cuCsub( cuCmul( matrix[4],
  143. cuCsub( cuCmul(matrix[10],matrix[15]),
  144. cuCmul(matrix[11],matrix[14]) ))
  145. ,cuCmul(matrix[6],
  146. cuCsub( cuCmul(matrix[8],matrix[15]),
  147. cuCmul(matrix[11],matrix[12]) ) ) )
  148. ,cuCmul(matrix[7],
  149. cuCsub( cuCmul(matrix[8],matrix[14]),
  150. cuCmul(matrix[10],matrix[12]) ) ) ) );
  151. det3 = cuCmul(matrix[2],
  152. cuCadd( cuCsub( cuCmul( matrix[4],
  153. cuCsub( cuCmul(matrix[9],matrix[15]),
  154. cuCmul(matrix[11],matrix[13]) ))
  155. ,cuCmul(matrix[5],
  156. cuCsub( cuCmul(matrix[8],matrix[15]),
  157. cuCmul(matrix[11],matrix[12]) ) ) )
  158. ,cuCmul(matrix[7],
  159. cuCsub( cuCmul(matrix[8],matrix[13]),
  160. cuCmul(matrix[9],matrix[12]) ) ) ) );
  161. det4 = cuCmul(matrix[3],
  162. cuCadd( cuCsub( cuCmul( matrix[4],
  163. cuCsub( cuCmul(matrix[9],matrix[14]),
  164. cuCmul(matrix[10],matrix[13]) ))
  165. ,cuCmul(matrix[5],
  166. cuCsub( cuCmul(matrix[8],matrix[14]),
  167. cuCmul(matrix[10],matrix[12]) ) ) )
  168. ,cuCmul(matrix[6],
  169. cuCsub( cuCmul(matrix[8],matrix[13]),
  170. cuCmul(matrix[9],matrix[12]) ) ) ) );
  171. det = cuCsub( cuCadd(det1,det3), cuCadd(det2,det4) );
  172. return det;
  173. }
  174. __device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
  175. cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
  176. // Calculate the determinant using cofactor expansions
  177. for (int col = 0; col < N; col++) {
  178. cuDoubleComplex submatrix[N-1][N-1];
  179. int subrow = 0, subcol = 0;
  180. // Create the submatrix by excluding the current row and column
  181. for (int row = 1; row < N; row++) {
  182. for (int c = 0; c < N; c++) {
  183. if (c == col) {
  184. continue;
  185. }
  186. submatrix[subrow][subcol] = matrix[row * N + c];
  187. subcol++;
  188. }
  189. subrow++;
  190. subcol = 0;
  191. }
  192. // Calculate the cofactor
  193. cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1),0.0);
  194. cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign,matrix[col]), det_4x4(&submatrix[0][0]));
  195. // Accumulate the cofactors to calculate the determinant
  196. det = cuCadd(det, cofactor);
  197. }
  198. return det;
  199. }
  200. __device__ cuDoubleComplex gpu_device_kernel(int i, int method) {
  201. // some random data, depending on `i`, but for simplification say constant data
  202. cuDoubleComplex matrix[5*5] = {
  203. make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
  204. make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
  205. make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
  206. make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
  207. make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
  208. };
  209. cuDoubleComplex cf_sign[5]={make_cuDoubleComplex(1.0,0.0),
  210. make_cuDoubleComplex(-1.0,0.0),
  211. make_cuDoubleComplex(1.0,0.0),
  212. make_cuDoubleComplex(-1.0,0.0),
  213. make_cuDoubleComplex(1.0,0.0),
  214. };
  215. cuDoubleComplex det;
  216. if (method == 0) {
  217. det = direct_det5(matrix);
  218. } else {
  219. det = det_5x5(matrix);
  220. }
  221. return det;
  222. }
  223. __global__ void gpu_global_kernel(cuDoubleComplex *d_res, int method) {
  224. // some other computations, irrelevant to current problem
  225. uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands
  226. cuDoubleComplex det_sum, det;
  227. det_sum = make_cuDoubleComplex(0.0,0.0);
  228. for (int i = 0; i < total; i++) {
  229. det = gpu_device_kernel(i, method);
  230. det_sum = cuCadd(det_sum, det);
  231. }
  232. det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
  233. d_res[0] = det_sum;
  234. return;
  235. }
  236. int main(int argc, char *argv[]) {
  237. if (argc != 2) {
  238. printf("Usage: %s <num1> <num2>\n", argv[0]);
  239. return 1;
  240. }
  241. int method;
  242. method = atoi(argv[1]);
  243. double complex *res;
  244. res = (double complex*)malloc(sizeof(double complex));
  245. memset(res, 0.0,sizeof(double complex));
  246. cuDoubleComplex *d_res;
  247. cudaMalloc(&d_res, sizeof(cuDoubleComplex) );
  248. int *d_method;
  249. cudaMalloc(&d_method, sizeof(int) );
  250. cudaMemcpy(d_method, &method, sizeof(int), cudaMemcpyHostToDevice);
  251. clock_t direct_start, direct_end;
  252. direct_start = clock();
  253. gpu_global_kernel <<<1,1>>>(d_res, *d_method);
  254. cudaDeviceSynchronize();
  255. direct_end = clock();
  256. cudaMemcpy(res, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );
  257. printf("RESULT: %.16E + %.16E i\n", creal(res[0]), cimag(res[0]));
  258. double direct_time = (double)((double)(direct_end - direct_start) / CLOCKS_PER_SEC);
  259. if (method == 0) {
  260. printf("DIRECT EXEC: %.5f s \n", direct_time);
  261. } else {
  262. printf("COFACTOR EXEC: %.5f s \n", direct_time);
  263. }
  264. cudaDeviceReset();
  265. return 0;
  266. }

However, I am wondering if these implementation is even efficient.

I am looking for an alternative method to compute the determinant of a general matrix inside a device function. The example I have provided is one of the most straightforward methods, but an alternative approach, such as LU/Cholesky decomposition, that can be called from the device function is what I am seeking. Unfortunately, packages like LAPACK are not available for CUDA, but maybe a similar compatible function from some other packages could work within the CUDA device environment.

Any suggestions or function examples would be greatly appreciated!

答案1

得分: 1

由于您包含了一个C++头文件(<cstring>),我假设这是一段C++代码,尽管它看起来非常像C语言。

我直接采用您提供的用于扩展的生成代码,通过简单地定义cuDoubleComplex*+-运算符来实现它。我在下面的代码中将此方法称为“generated”。此外,我对您的代码进行了一些轻微的修改。我添加了一个程序参数来指定GPU线程的数量,以确保GPU的充分利用。我还为每种方法使用了单独的内核。方法0是原始方法,方法1是您的直接方法,方法2是我的“generated”方法。

我使用以下命令编译了该代码:nvcc -std=c++14 -O3 -arch=sm_70 main.cu -o main,并在V100 Pcie上针对32000个GPU线程看到以下时间。

配置 时间
原始 145.01487秒
直接 67.07063秒
生成 24.02771秒

以下是代码:

  1. #include <stdio.h>
  2. #include <complex.h>
  3. #include <cstring>
  4. #include <cuda.h>
  5. #include <cuComplex.h>
  6. #include <cuda_runtime.h>
  7. #include <device_launch_parameters.h>
  8. #define N 5
  9. // ... 其余代码 ...

请注意,我仅返回代码的部分翻译。如果您需要更多信息或有其他问题,请随时提出。

英文:

Since you are including a C++ header (<cstring>) I am assuming its a C++ code, even though it looks very C-like.

I took the general generated code that you provided for expansion and implemented it directly as is by simply defining the operators *,+,- for the cuDoubleComplex .

I call this method "generated" in the code below. Additionally, I made some slight modifications to your code. I added a program parameter to specify the number of GPU threads to have a proper GPU utilization. I also use separate kernels for each method. Method 0 is the original method, method 1 is your direct method, method 2 is my "generated" method.

I compiled the code with nvcc -std=c++14 -O3 -arch=sm_70 main.cu -o main and see the following timings on V100 Pcie for 32000 GPU threads.

Configuration Timing
original 145.01487 s
direct 67.07063 s
generated 24.02771 s

The code:

  1. #include <stdio.h>
  2. #include <complex.h>
  3. #include <cstring>
  4. #include <cuda.h>
  5. #include <cuComplex.h>
  6. #include <cuda_runtime.h>
  7. #include <device_launch_parameters.h>
  8. #define N 5
  9. __host__ __device__
  10. bool operator==(cuDoubleComplex l, cuDoubleComplex r){
  11. return cuCreal(l) == cuCreal(r) && cuCimag(l) == cuCimag(r);
  12. }
  13. __host__ __device__
  14. cuDoubleComplex operator*(cuDoubleComplex l, cuDoubleComplex r){
  15. return cuCmul(l,r);
  16. }
  17. __host__ __device__
  18. cuDoubleComplex operator+(cuDoubleComplex l, cuDoubleComplex r){
  19. return cuCadd(l,r);
  20. }
  21. __host__ __device__
  22. cuDoubleComplex operator-(cuDoubleComplex l, cuDoubleComplex r){
  23. return cuCsub(l,r);
  24. }
  25. __host__ __device__
  26. cuDoubleComplex operator-(cuDoubleComplex l){
  27. return make_cuDoubleComplex(-cuCreal(l), -cuCimag(l));
  28. }
  29. template<class Matrix>
  30. __device__
  31. cuDoubleComplex generated_det_5x5(Matrix B){
  32. return
  33. B[0*5+0]*(B[1*5+1]*(B[2*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[2*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[2*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[2*5+1]*(B[1*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[1*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[1*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[1*5+2]*(B[2*5+3]*B[4*5+4]-B[4*5+3]*B[2*5+4])-B[1*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[4*5+1]*(B[1*5+2]*(B[2*5+3]*B[3*5+4]-B[3*5+3]*B[2*5+4])-B[1*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[3*5+3]-B[2*5+3]*B[3*5+2])))
  34. - B[1*5+0]*(B[0*5+1]*(B[2*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[2*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[2*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[2*5+1]*(B[0*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[0*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[0*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[0*5+2]*(B[2*5+3]*B[4*5+4]-B[4*5+3]*B[2*5+4])-B[0*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[4*5+1]*(B[0*5+2]*(B[2*5+3]*B[3*5+4]-B[3*5+3]*B[2*5+4])-B[0*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[3*5+3]-B[2*5+3]*B[3*5+2])))
  35. + B[2*5+0]*(B[0*5+1]*(B[1*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[1*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[1*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[1*5+1]*(B[0*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[0*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[0*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[0*5+2]*(B[1*5+3]*B[4*5+4]-B[4*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[4*5+4]-B[4*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[4*5+3]-B[4*5+2]*B[1*5+3]))-B[4*5+1]*(B[0*5+2]*(B[1*5+3]*B[3*5+4]-B[3*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[3*5+4]-B[3*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[3*5+3]-B[1*5+3]*B[3*5+2])))
  36. - B[3*5+0]*(B[0*5+1]*(B[1*5+2]*(B[2*5+3]*B[4*5+4]-B[2*5+4]*B[4*5+3])-B[1*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[1*5+1]*(B[0*5+2]*(B[2*5+3]*B[4*5+4]-B[2*5+4]*B[4*5+3])-B[0*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))+B[2*5+1]*(B[0*5+2]*(B[1*5+3]*B[4*5+4]-B[4*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[4*5+4]-B[4*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[4*5+3]-B[4*5+2]*B[1*5+3]))-B[4*5+1]*(B[0*5+2]*(B[1*5+3]*B[2*5+4]-B[2*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[2*5+4]-B[2*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[2*5+3]-B[1*5+3]*B[2*5+2])))
  37. + B[4*5+0]*(B[0*5+1]*(B[1*5+2]*(B[2*5+3]*B[3*5+4]-B[2*5+4]*B[3*5+3])-B[1*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[3*5+3]-B[3*5+2]*B[2*5+3]))-B[1*5+1]*(B[0*5+2]*(B[2*5+3]*B[3*5+4]-B[2*5+4]*B[3*5+3])-B[0*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[3*5+3]-B[3*5+2]*B[2*5+3]))+B[2*5+1]*(B[0*5+2]*(B[1*5+3]*B[3*5+4]-B[3*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[3*5+4]-B[3*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[3*5+3]-B[3*5+2]*B[1*5+3]))-B[3*5+1]*(B[0*5+2]*(B[1*5+3]*B[2*5+4]-B[2*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[2*5+4]-B[2*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[2*5+3]-B[1*5+3]*B[2*5+2])));
  38. }
  39. __host__ __device__ static __inline__ cuDoubleComplex cuCfms( cuDoubleComplex x,
  40. cuDoubleComplex y,
  41. cuDoubleComplex d){
  42. double real_res;
  43. double imag_res;
  44. real_res = (cuCreal(x) * cuCreal(y)) - cuCreal(d);
  45. imag_res = (cuCreal(x) * cuCimag(y)) - cuCimag(d);
  46. real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
  47. imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
  48. return make_cuDoubleComplex(real_res, imag_res);
  49. }
  50. __host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
  51. cuDoubleComplex y,
  52. cuDoubleComplex u,
  53. cuDoubleComplex v) {
  54. return make_cuDoubleComplex (cuCreal(x) + cuCreal(y) + cuCreal(u) + cuCreal(v),
  55. cuCimag(x) + cuCimag(y) + cuCimag(u) + cuCimag(v));
  56. }
  57. __host__ __device__ static __inline__ cuDoubleComplex cuCasas(cuDoubleComplex x,
  58. cuDoubleComplex y,
  59. cuDoubleComplex u,
  60. cuDoubleComplex v) {
  61. return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v),
  62. cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v));
  63. }
  64. __host__ __device__ static __inline__ cuDoubleComplex cuCasasa(cuDoubleComplex x,
  65. cuDoubleComplex y,
  66. cuDoubleComplex u,
  67. cuDoubleComplex v,
  68. cuDoubleComplex w) {
  69. return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v) + cuCreal(w),
  70. cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v) + cuCimag(w) );
  71. }
  72. __device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
  73. cuDoubleComplex c, cuDoubleComplex d,
  74. cuDoubleComplex e) {
  75. // B[12]*(B[18]*B[24] - B[19]*B[23])
  76. //a*b*c -d*e
  77. return cuCmul(a, cuCfms(b,c, cuCmul(d,e)));
  78. }
  79. __device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
  80. cuDoubleComplex c, cuDoubleComplex d,
  81. cuDoubleComplex e) {
  82. // B[12]*(B[18]*B[24] - B[19]*B[23])
  83. //a*b*c -d*e
  84. return cuCmul(cuCmul(make_cuDoubleComplex(-1.0,0.0),a), cuCfms(b,c, cuCmul(d,e)));
  85. }
  86. __device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
  87. cuDoubleComplex det;
  88. cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
  89. cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
  90. cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
  91. cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
  92. cuDoubleComplex det5 = make_cuDoubleComplex(0.0,0.0);
  93. cuDoubleComplex det11 = cuCmul(B[0],cuCmul(B[6],cuCadd( cuC_5pos(B[12],B[18],B[24],B[19],B[23]),
  94. cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]),
  95. cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
  96. cuDoubleComplex det12 = cuCmul(B[0],cuCmul(B[11],cuCadd(cuC_5pos(B[7],B[18],B[24],B[19],B[23]),
  97. cuCadd( cuC_5neg(B[8],B[17],B[24],B[22],B[19]),
  98. cuC_5pos(B[9],B[17],B[23],B[22],B[18]) ) ) ) );
  99. cuDoubleComplex det13 = cuCmul(B[0],cuCmul(B[16],cuCadd(cuC_5pos(B[7],B[13],B[24],B[23],B[14]),
  100. cuCadd( cuC_5neg(B[8],B[12],B[24],B[22],B[14]),
  101. cuC_5pos(B[9],B[12],B[23],B[22],B[13]) ) ) ) );
  102. cuDoubleComplex det14 = cuCmul(B[0],cuCmul(B[21],cuCadd(cuC_5pos(B[7],B[13],B[19],B[18],B[14]),
  103. cuCadd( cuC_5neg(B[8],B[12],B[19],B[17],B[14]),
  104. cuC_5pos(B[9],B[12],B[18],B[13],B[17]) ) ) ) );
  105. det1 = cuCasas(det11, det12, det13, det14);
  106. cuDoubleComplex det21 = cuCmul(B[1*5+0],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[12],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]), cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
  107. cuDoubleComplex det22 = cuCmul(B[1*5+0],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[2],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[3],B[17],B[24],B[22],B[19]), cuC_5pos(B[4],B[17],B[23],B[22],B[18]) ) ) ) );
  108. cuDoubleComplex det23 = cuCmul(B[1*5+0],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[2],B[13],B[24],B[23],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[24],B[22],B[14]), cuC_5pos(B[4],B[12],B[23],B[22],B[13]) ) ) ) );
  109. cuDoubleComplex det24 = cuCmul(B[1*5+0],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[2],B[13],B[19],B[18],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[19],B[17],B[14]), cuC_5pos(B[4],B[12],B[18],B[13],B[17]) ) ) ) );
  110. det2 = cuCasas(det21, det22, det23, det24);
  111. cuDoubleComplex det31 = cuCmul(B[10],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
  112. cuCadd( cuC_5neg(B[1*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
  113. cuC_5pos(B[1*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
  114. cuDoubleComplex det32 = cuCmul(B[10],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
  115. cuCadd( cuC_5neg(B[0*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
  116. cuC_5pos(B[0*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
  117. cuDoubleComplex det33 = cuCmul(B[10],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
  118. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
  119. cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
  120. cuDoubleComplex det34 = cuCmul(B[10],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
  121. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
  122. cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[1*5+3],B[3*5+2]) ) ) ) );
  123. det3 = cuCasas(det31, det32, det33, det34);
  124. cuDoubleComplex det41 = cuCmul(B[15],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
  125. cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
  126. cuC_5pos(B[1*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
  127. cuDoubleComplex det42 = cuCmul(B[15],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
  128. cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
  129. cuC_5pos(B[0*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
  130. cuDoubleComplex det43 = cuCmul(B[15],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
  131. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
  132. cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
  133. cuDoubleComplex det44 = cuCmul(B[15],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
  134. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
  135. cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
  136. det4 = cuCasas(det41, det42, det43, det44);
  137. cuDoubleComplex det51 = cuCmul(B[20],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
  138. cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
  139. cuC_5pos(B[1*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
  140. cuDoubleComplex det52 = cuCmul(B[20],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
  141. cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
  142. cuC_5pos(B[0*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
  143. cuDoubleComplex det53 = cuCmul(B[20],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
  144. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
  145. cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[3*5+2],B[1*5+3]) ) ) ) );
  146. cuDoubleComplex det54 = cuCmul(B[20],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
  147. cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
  148. cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
  149. det5 = cuCasas(det51, det52, det53, det54);
  150. det = cuCasasa(det1,det2,det3,det4,det5);
  151. return det;
  152. }
  153. __device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
  154. cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
  155. cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
  156. cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
  157. cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
  158. cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
  159. // Calculate the determinant using cofactor expansions
  160. det1 = cuCmul(matrix[0],
  161. cuCadd( cuCsub( cuCmul( matrix[5],
  162. cuCsub( cuCmul(matrix[10],matrix[15]),
  163. cuCmul(matrix[11],matrix[14]) ))
  164. ,cuCmul(matrix[6],
  165. cuCsub( cuCmul(matrix[9],matrix[15]),
  166. cuCmul(matrix[11],matrix[13]) ) ) )
  167. ,cuCmul(matrix[7],
  168. cuCsub( cuCmul(matrix[9],matrix[14]),
  169. cuCmul(matrix[10],matrix[13]) ) ) ) );
  170. det2 = cuCmul(matrix[1],
  171. cuCadd( cuCsub( cuCmul( matrix[4],
  172. cuCsub( cuCmul(matrix[10],matrix[15]),
  173. cuCmul(matrix[11],matrix[14]) ))
  174. ,cuCmul(matrix[6],
  175. cuCsub( cuCmul(matrix[8],matrix[15]),
  176. cuCmul(matrix[11],matrix[12]) ) ) )
  177. ,cuCmul(matrix[7],
  178. cuCsub( cuCmul(matrix[8],matrix[14]),
  179. cuCmul(matrix[10],matrix[12]) ) ) ) );
  180. det3 = cuCmul(matrix[2],
  181. cuCadd( cuCsub( cuCmul( matrix[4],
  182. cuCsub( cuCmul(matrix[9],matrix[15]),
  183. cuCmul(matrix[11],matrix[13]) ))
  184. ,cuCmul(matrix[5],
  185. cuCsub( cuCmul(matrix[8],matrix[15]),
  186. cuCmul(matrix[11],matrix[12]) ) ) )
  187. ,cuCmul(matrix[7],
  188. cuCsub( cuCmul(matrix[8],matrix[13]),
  189. cuCmul(matrix[9],matrix[12]) ) ) ) );
  190. det4 = cuCmul(matrix[3],
  191. cuCadd( cuCsub( cuCmul( matrix[4],
  192. cuCsub( cuCmul(matrix[9],matrix[14]),
  193. cuCmul(matrix[10],matrix[13]) ))
  194. ,cuCmul(matrix[5],
  195. cuCsub( cuCmul(matrix[8],matrix[14]),
  196. cuCmul(matrix[10],matrix[12]) ) ) )
  197. ,cuCmul(matrix[6],
  198. cuCsub( cuCmul(matrix[8],matrix[13]),
  199. cuCmul(matrix[9],matrix[12]) ) ) ) );
  200. det = cuCsub( cuCadd(det1,det3), cuCadd(det2,det4) );
  201. return det;
  202. }
  203. __device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
  204. cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
  205. // Calculate the determinant using cofactor expansions
  206. #pragma unroll
  207. for (int col = 0; col < N; col++) {
  208. cuDoubleComplex submatrix[N-1][N-1];
  209. int subrow = 0, subcol = 0;
  210. // Create the submatrix by excluding the current row and column
  211. #pragma unroll
  212. for (int row = 1; row < N; row++) {
  213. #pragma unroll
  214. for (int c = 0; c < N; c++) {
  215. if (c == col) {
  216. continue;
  217. }
  218. submatrix[subrow][subcol] = matrix[row * N + c];
  219. subcol++;
  220. }
  221. subrow++;
  222. subcol = 0;
  223. }
  224. // Calculate the cofactor
  225. cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1),0.0);
  226. cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign,matrix[col]), det_4x4(&submatrix[0][0]));
  227. // Accumulate the cofactors to calculate the determinant
  228. det = cuCadd(det, cofactor);
  229. }
  230. return det;
  231. }
  232. __device__ cuDoubleComplex gpu_device_kernel_original(int i) {
  233. // some random data, depending on `i`, but for simplification say constant data
  234. cuDoubleComplex matrix[5*5] = {
  235. make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
  236. make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
  237. make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
  238. make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
  239. make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
  240. };
  241. cuDoubleComplex det = det_5x5(matrix);
  242. return det;
  243. }
  244. __device__ cuDoubleComplex gpu_device_kernel_direct(int i) {
  245. // some random data, depending on `i`, but for simplification say constant data
  246. cuDoubleComplex matrix[5*5] = {
  247. make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
  248. make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
  249. make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
  250. make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
  251. make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
  252. };
  253. cuDoubleComplex det = direct_det5(matrix);
  254. return det;
  255. }
  256. __device__ cuDoubleComplex gpu_device_kernel_generated(int i) {
  257. // some random data, depending on `i`, but for simplification say constant data
  258. cuDoubleComplex matrix[5*5] = {
  259. make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
  260. make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
  261. make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
  262. make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
  263. make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
  264. };
  265. cuDoubleComplex det = generated_det_5x5(matrix);
  266. return det;
  267. }
  268. __global__ void gpu_global_kernel_original(cuDoubleComplex *d_res, int numGpuThreads) {
  269. const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  270. if(tid < numGpuThreads){
  271. // some other computations, irrelevant to current problem
  272. uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands
  273. cuDoubleComplex det_sum, det;
  274. det_sum = make_cuDoubleComplex(0.0,0.0);
  275. for (int i = 0; i < total; i++) {
  276. det = gpu_device_kernel_original(i);
  277. det_sum = cuCadd(det_sum, det);
  278. }
  279. det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
  280. d_res[tid] = det_sum;
  281. }
  282. }
  283. __global__ void gpu_global_kernel_direct(cuDoubleComplex *d_res, int numGpuThreads) {
  284. const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  285. if(tid < numGpuThreads){
  286. // some other computations, irrelevant to current problem
  287. uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands
  288. cuDoubleComplex det_sum, det;
  289. det_sum = make_cuDoubleComplex(0.0,0.0);
  290. for (int i = 0; i < total; i++) {
  291. det = gpu_device_kernel_direct(i);
  292. det_sum = cuCadd(det_sum, det);
  293. }
  294. det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
  295. d_res[tid] = det_sum;
  296. }
  297. }
  298. __global__ void gpu_global_kernel_generated(cuDoubleComplex *d_res, int numGpuThreads) {
  299. const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  300. if(tid < numGpuThreads){
  301. // some other computations, irrelevant to current problem
  302. uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands
  303. cuDoubleComplex det_sum, det;
  304. det_sum = make_cuDoubleComplex(0.0,0.0);
  305. for (int i = 0; i < total; i++) {
  306. det = gpu_device_kernel_generated(i);
  307. det_sum = cuCadd(det_sum, det);
  308. }
  309. det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
  310. d_res[tid] = det_sum;
  311. }
  312. }
  313. int main(int argc, char *argv[]) {
  314. if (argc < 2) {
  315. printf("Usage: %s <method> [<numGpuThreads>]\n", argv[0]);
  316. printf("method: 0 = original, 1 = direct, 2 = generated(no explicit cucomplex)\n");
  317. return 1;
  318. }
  319. int method;
  320. method = atoi(argv[1]);
  321. int numGpuThreads = 1;
  322. if(argc > 2){
  323. numGpuThreads = atoi(argv[2]);
  324. }
  325. cuDoubleComplex *res;
  326. res = (cuDoubleComplex*)malloc(sizeof(cuDoubleComplex) * numGpuThreads);
  327. memset(res, 0.0,sizeof(cuDoubleComplex) * numGpuThreads);
  328. cuDoubleComplex *d_res;
  329. cudaMalloc(&d_res, sizeof(cuDoubleComplex) * numGpuThreads );
  330. clock_t direct_start, direct_end;
  331. direct_start = clock();
  332. dim3 block = 128;
  333. dim3 grid = (numGpuThreads + block.x -1) / block.x;
  334. switch(method){
  335. case 0: printf("original\n"); gpu_global_kernel_original<<<grid,block>>>(d_res, numGpuThreads); break;
  336. case 1: printf("direct\n"); gpu_global_kernel_direct<<<grid,block>>>(d_res, numGpuThreads); break;
  337. case 2: printf("generated\n"); gpu_global_kernel_generated<<<grid,block>>>(d_res, numGpuThreads); break;
  338. }
  339. cudaDeviceSynchronize();
  340. direct_end = clock();
  341. //just copy the first result, not all numGpuThreads results
  342. cudaMemcpy(res, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );
  343. printf("RESULT: %.16E + %.16E i\n", cuCreal(res[0]), cuCimag(res[0]));
  344. double timing = (double)((double)(direct_end - direct_start) / CLOCKS_PER_SEC);
  345. printf("time %.5f s\n", timing);
  346. cudaDeviceReset();
  347. return 0;
  348. }

huangapple
  • 本文由 发表于 2023年5月28日 22:03:36
  • 转载请务必保留本文链接:https://go.coder-hub.com/76351879.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定