英文:
How can I optimize this bit-packing function in C?
问题
这个代码示例接受任意数量的字节作为输入,一次处理一个字节,并将它们映射到一个包含32个值的表[0,31]中。为了简化示例,我使用了模32运算。">> 3" 操作相当于除以8。如果不清楚的话,“q”循环计数到5,因为0到31的数字只需要5位分辨率。
在速度方面是否可以改进?
uint8_t sto[10000];
uint8_t srt[32] = {<32个值总共>};
uint8_t t,q;
uint64_t loop,cr = 0;
for(loop = 0; loop < 10000; loop++) {
t = srt[loop % 32];
for(q = 0; q < 5; q++) {
if(t & (0x1 << (4 - q)))
sto[cr >> 3] |= (0x1 << (7 - (cr % 8)));
cr++;
}
}
英文:
This code sample takes an arbitrarily large number of bytes as inputs, one at the time, and maps them to a table with 32 values [0,31]. To simplify the example I used mod 32. The >> 3 operation is equivalent to division by 8. In case it is not clear, the "q" loop counts to 5 because numbers from 0 to 31 only require 5-bits of resolution.
Can this be improved in terms of speed?
uint8_t sto[10000];
uint8_t srt[32] = {<32 values in total>};
uint8_t t,q
uint64_t loop,cr = 0;
for(loop = 0; loop < 10000; loop++) {
t = srt[loop % 32];
for(q = 0; q < 5; q++) {
if(t & (0x1 << (4 - q)))
sto[cr >> 3] |= (0x1 << (7 - (cr % 8)));
cr++;
}
}
答案1
得分: 3
以下是您提供的内容的中文翻译:
更好的优化可能需要更大的代码和定时测试工具。候选的简化(都是次要的)。
t
从未被使用。删掉它
删掉代码 t = srt[loop % 32];
。
// t = srt[loop % 32];
@affluentbarnburner,你是不是指的是 if(t & (0x1 << (4 - q)))
而不是 if(srt[ord] & (0x1 << (4 - q)))
?
形成一个 掩码
//for(q = 0; q < 5; q++) {
// if(srt[ord] & (0x1 << (4 - q)))
for (unsigned mask = 1 << 4; mask; mask >>= 1) {
if (srt[ord] & mask)
使用 size_t
进行数组索引
// uint64_t loop;
size_t loop;
for(loop = 0; loop < largeNumber; loop++) {
使用字节和位索引,而不是合并的索引
分离(较窄的)索引 可能 更快。
// uint64_t loop,cr = 0;
size_t loop, cr_byte = 0;
unsigned cr_bit_mask = 0x80;
for(loop = 0; loop < largeNumber; loop++) {
// t = srt[loop % 32];
for (unsigned mask = 1 << 4; mask; mask >>= 1) {
if(srt[ord] & mask) {
sto[cr_byte] |= cr_bit_mask;
}
cr_bit_mask >>= 1;
if (cr_bit_mask == 0) {
cr_bit_mask = 0x80;
cr_byte++;
}
}
}
英文:
Better optimization possible with larger code and a timing test harness.
Candidate simplifications (All minor).
t
never used. Drop it
Drop code t = srt[loop % 32];
.
// t = srt[loop % 32];
@affluentbarnburner, did you mean if(t & (0x1 << (4 - q)))
instead of if(srt[ord] & (0x1 << (4 - q)))
?
Form a mask
//for(q = 0; q < 5; q++) {
// if(srt[ord] & (0x1 << (4 - q)))
for (unsigned mask = 1 << 4; mask; mask >>= 1) {
if (srt[ord] & mask)
Use size_t
for array indexing
// uint64_t loop;
size_t loop;
for(loop = 0; loop < largeNumber; loop++) {
Use byte and bit indexes rather than a combined one
Separate (narrower) indexes may be faster.
// uint64_t loop,cr = 0;
size_t loop, cr_byte = 0;
unsigned cr_bit_mask = 0x80;
for(loop = 0; loop < largeNumber; loop++) {
// t = srt[loop % 32];
for (unsigned mask = 1 << 4; mask; mask >>= 1) {
if(srt[ord] & mask) {
sto[cr_byte] |= cr_bit_mask;
}
cr_bit_mask >>= 1;
if (cr_bit_mask == 0) {
cr_bit_mask = 0x80;
cr_byte++;
}
}
}
答案2
得分: 1
这段代码可以简化为:
for (size_t i = 0; i < 10000; ++i)
sto[i] |= bits[i % NBytes];
在对bits
进行一些准备工作后。这个循环可以展开,可以转换为比uint8_t
更宽的单元,可以转换为SIMD代码,并且可以并行化。在进行了一些优化,比如转换为更宽的单元后,可能会达到内存带宽,因此进一步的优化可能没有用处。
准备bits
的工作是将srt
中的位合并成完整的位掩码:
#define NSrt 32 // srt中的元素数量。
#define NBits (NSrt*5) // 从srt中使用的位数。
#define NBytes (NBits/8) // NBits使用的字节数。
/* 将srt元素的位合并为字节,每次迭代处理srt的8个元素(i)和bits的5个元素(j)。 */
uint8_t bits[NBytes];
for (size_t i = 0, j = 0; i < NSrt; i += 8, j += 5)
{
/* 从每个srt元素获取五位。(b0不需要被屏蔽,因为它的高位将从uint8_t的范围内移出。) */
uint8_t
b0 = srt[i + 0],
b1 = srt[i + 1] & 0x1f,
b2 = srt[i + 2] & 0x1f,
b3 = srt[i + 3] & 0x1f,
b4 = srt[i + 4] & 0x1f,
b5 = srt[i + 5] & 0x1f,
b6 = srt[i + 6] & 0x1f,
b7 = srt[i + 7] & 0x1f;
// 将五位段合并为八位单元。
bits[j + 0] = b0 << 3 | b1 >> 2;
bits[j + 1] = b1 << 6 | b2 << 1 | b3 >> 4;
bits[j + 2] = b3 << 4 | b4 >> 1;
bits[j + 3] = b4 << 7 | b5 << 2 | b6 >> 3;
bits[j + 4] = b6 << 5 | b7;
}
英文:
The code can be reduced to simply:
for (size_t i = 0; i < 10000; ++i)
sto[i] |= bits[i % NBytes];
after doing some work to prepare bits
. That loop can be unrolled, can be converted to units wider than uint8_t
, can be converted to SIMD code, and can be parallelized. After a few of those, such as converting to wider units, it might stream at memory bandwidth, so further optimization might not be useful.
The work to prepare bits
is to consolidate the bits from srt
into a full bitmask:
#define NSrt 32 // Number of elements in srt.
#define NBits (NSrt*5) // Number of bits used from srt.
#define NBytes (NBits/8) // Number of bytes used by NBits.
/* Consolidate the bits of srt into bytes, processing eight elements of
srt (i) and five elements of bits (j) per iteration.
*/
uint8_t bits[NBytes];
for (size_t i = 0, j = 0; i < NSrt; i += 8, j += 5)
{
/* Get five bits from each srt elements. (b0 does not need to be
masked because its high bits will be shifted out of the uint8_t
span.)
*/
uint8_t
b0 = srt[i + 0],
b1 = srt[i + 1] & 0x1f,
b2 = srt[i + 2] & 0x1f,
b3 = srt[i + 3] & 0x1f,
b4 = srt[i + 4] & 0x1f,
b5 = srt[i + 5] & 0x1f,
b6 = srt[i + 6] & 0x1f,
b7 = srt[i + 7] & 0x1f;
// Merge the five-bit segments into eight-bit units.
bits[j + 0] = b0 << 3 | b1 >> 2;
bits[j + 1] = b1 << 6 | b2 << 1 | b3 >> 4;
bits[j + 2] = b3 << 4 | b4 >> 1;
bits[j + 3] = b4 << 7 | b5 << 2 | b6 >> 3;
bits[j + 4] = b6 << 5 | b7;
}
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论