英文:
Proper way to cast 'threadIdx.x's into higher type in CUDA kernel (%lu format in printf malfunctions in the CUDA kernel?)
问题
我需要访问CUDA内核中一个非常大的数组的元素。在某些应用中,数组的大小可能会超过INT_MAX。
基本上,它们采用以下形式:
__global__ function(double *dArr) {
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
dArr[index * WIDTH] = ...; // WIDTH is 256 or 512.
}
据我了解,CUDA变量如threadIdx.x
是无符号整数unsigned int
,其限制比通常的uint
要小。
我正在尝试将这些CUDA变量转换为更高的类型,以便它们可以用作大数组的索引。
我已经尝试了一些方法,但仍然无法使我的内核与大数组一起正常工作。甚至我无法理解以下简单代码的结果(它甚至没有处理大数字)。
#include <cstdio>
__global__ void printIndex() {
printf("blockIdx.x %lu (%d), blockDim.x %lu (%d), threadIdx.x %lu (%d)\n",
blockIdx.x, blockIdx.x, blockDim.x, blockDim.x, threadIdx.x, threadIdx.x);
//printf("blockIdx.x %d, blockDim.x %d, threadIdx.x %d \n", blockIdx.x, blockDim.x, threadIdx.x); // this works fine.
}
int main() {
printIndex<<<2,64>>>();
cudaDeviceSynchronize();
unsigned int ui = 1000;
printf("ui %lu (%d) \n", ui, ui); // this is just for the comparison.
return 0;
}
我选择%lu
的原因是为了模仿将某种类型强制转换为更高类型的方式,结果相当奇怪。(当我使用%u
时,它正常工作)
在最后一行中,我看到ui
使用%lu
打印正常,而CUDA内核中的打印非常奇怪。首先,当我处理最多64的数字时,%lu
和%d
转换不相同。其次,即使%d
表示也不正确。blockIdx.x
应该是0或1。
我迷失在哪里了?要使用printf
检查threadIdx.x
的内容,正确的方式是什么?如果我要将它们强制转换为可能超过INT_MAX(或UINT_MAX)的更高类型,应该如何进行正确的强制转换?
我添加了C标签,因为这涉及到<cstdio>
中的printf
。
英文:
I have to access elements of a very large array in CUDA kernels. The size of arrays could be above INT_MAX in some applications.
Essentially those are taking the form of the following.
__global__ function(double *dArr) {
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
dArr[index * WIDTH] = ...; // WIDTH is 256 or 512.
}
To my understanding, CUDA variables such as threadIdx.x
are unsigned int
with smaller limits than the usual uint
.
I am trying to cast these CUDA variables into higher type so that they can be used as an index of the large array.
I have tried some, but still my kernels are not working with large arrays. And I cannot even understand the results from the following simple lines of code (it is not even taking large numbers).
#include <cstdio>
__global__ void printIndex() {
printf("blockIdx.x %lu (%d), blockDim.x %lu (%d), threadIdx.x %lu (%d)\n",
blockIdx.x, blockIdx.x, blockDim.x, blockDim.x, threadIdx.x, threadIdx.x);
//printf("blockIdx.x %d, blockDim.x %d, threadIdx.x %d \n", blockIdx.x, blockDim.x, threadIdx.x); // this works fine.
}
int main() {
printIndex<<<2,64>>>();
cudaDeviceSynchronize();
unsigned int ui = 1000;
printf("ui %lu (%d) \n", ui, ui); // this is just for the comparison.
return 0;
}
The reason I chose %lu
is to mimic a certain type of casting to higher type and the result is quite strange. (When I use %u
, it works fine)
blockIdx.x 4294967297 (64), blockDim.x 0 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 4294967297 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 8589934594 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 12884901891 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 17179869188 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 21474836485 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 25769803782 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 30064771079 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 34359738376 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 38654705673 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 42949672970 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 47244640267 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 51539607564 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 55834574861 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 60129542158 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 64424509455 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 68719476752 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 73014444049 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 77309411346 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 81604378643 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 85899345940 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 90194313237 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 94489280534 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 98784247831 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 103079215128 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 107374182425 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 111669149722 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 115964117019 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 120259084316 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 124554051613 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 128849018910 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 133143986207 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 137438953504 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 141733920801 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 146028888098 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 150323855395 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 154618822692 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 158913789989 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 163208757286 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 167503724583 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 171798691880 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 176093659177 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 180388626474 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 184683593771 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 188978561068 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 193273528365 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 197568495662 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 201863462959 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 206158430256 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 210453397553 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 214748364850 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 219043332147 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 223338299444 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 227633266741 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 231928234038 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 236223201335 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 240518168632 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 244813135929 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 249108103226 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 253403070523 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 257698037820 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 261993005117 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 266287972414 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 270582939711 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 0 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 4294967297 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 8589934594 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 12884901891 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 17179869188 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 21474836485 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 25769803782 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 30064771079 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 34359738376 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 38654705673 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 42949672970 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 47244640267 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 51539607564 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 55834574861 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 60129542158 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 64424509455 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 68719476752 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 73014444049 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 77309411346 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 81604378643 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 85899345940 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 90194313237 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 94489280534 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 98784247831 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 103079215128 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 107374182425 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 111669149722 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 115964117019 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 120259084316 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 124554051613 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 128849018910 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 133143986207 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 137438953504 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 141733920801 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 146028888098 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 150323855395 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 154618822692 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 158913789989 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 163208757286 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 167503724583 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 171798691880 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 176093659177 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 180388626474 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 184683593771 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 188978561068 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 193273528365 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 197568495662 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 201863462959 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 206158430256 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 210453397553 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 214748364850 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 219043332147 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 223338299444 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 227633266741 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 231928234038 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 236223201335 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 240518168632 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 244813135929 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 249108103226 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 253403070523 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 257698037820 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 261993005117 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 266287972414 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 270582939711 (0), threadIdx.x 0 (6)
ui 1000 (1000)
At the last line I see that ui
is printed fine with %lu
, while the prints in the CUDA kernel is very strange. First of all, %lu
and %d
casts are not the same when I am dealing with numbers at most 64. Secondly, even %d
representations are not correct. blockIdx.x
's should be 0 or 1.
Where I got lost? To check the threadIdx.x
stuff with printf
, what would be the proper way to do? And if I were to cast those into higher type which can potentially get above INT_MAX (or UINT_MAX), what would be the proper way of cast?
I added c
tag since this is about printf
in <cstdio>
.
答案1
得分: 1
你需要为每个变量使用正确的格式字符串。使用64位值的格式字符串并传递32位值是无效的。(谁知道这样做会进行什么样的越界访问)
编译器应该会警告这个问题。
>参数与相应的格式字符串转换不兼容(期望类型为“unsigned long”,但参数类型为“unsigned int”)
如果你想使用%lu
进行打印,将参数强制转换为size_t
,即(size_t)threadIdx.x
。
要计算全局线程ID作为64位值,可以使用以下方式:
size_t id = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);
英文:
You need to use the correct format string for each variable. It is invalid to use the format string for a 64 bit value and pass a 32 bit value. (who knows what kind of out-of-bounds access is performed this way)
The compiler should warn about this.
>argument is incompatible with corresponding format string conversion (expected type "unsigned long" but argument has type "unsigned int")
If you want to print using %lu
, cast the argument to size_t, i.e. (size_t)threadIdx.x.
To compute the global thread id as 64 bit value, one could use
size_t id = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论