2023年6月12日 02:03:10go评论68阅读模式

英文:

Sieve eratosthenes parallelisation in c

问题

The code you provided is in C and appears to implement the Sieve of Eratosthenes algorithm with parallelization. Here's the translation of the specific code snippet you mentioned:

如果 (!(a[i / 64] & ((uint_fast64_t)1 << (i % 64))))       
    continue;

a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64));

Translation:

if (!(a[i / 64] & ((uint_fast64_t)1 << (i % 64))))       
    continue;

a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64));

In this code:

a[i / 64] and a[j / 64] are arrays of 64-bit integers, and i and j are loop variables.
i / 64 and j / 64 are used to determine the index of the 64-bit integer element in the array a that corresponds to the current value of i and j.
(i % 64) and (j % 64) are used to determine the bit position within the selected 64-bit integer where the operation is performed.
& is the bitwise AND operator, and << is the bitwise left shift operator.
The code checks and manipulates specific bits within the 64-bit integers in the array a. The if condition checks if a specific bit at position (i % 64) in a[i / 64] is not set (equals 0). The a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64)); line clears (sets to 0) a specific bit at position (j % 64) in a[j / 64].

This code is a crucial part of the Sieve of Eratosthenes algorithm and is used to mark and sieve out prime numbers.

英文:

I just had the solution for Sieve of Eratosthenes parallelisation in C from my teacher. What do these lines mean ?

if (!(a[i / 64] &amp; ((uint_fast64_t)1 &lt;&lt; (i % 64))))       
    continue;

a[j / 64] &amp;= ~(((uint_fast64_t)1) &lt;&lt; (j % 64));

Full code:

#include &lt;math.h&gt;
#include &lt;pthread.h&gt;
#include &lt;stdint.h&gt;
#include &lt;stdio.h&gt;
#include &lt;time.h&gt;
#include &lt;unistd.h&gt;

#pragma GCC optimize(&quot;unroll-loops&quot;)
#define N 100
#define K 7

uint_fast64_t a[N];

void *threadBehaviour(void *) {
    static uint16_t k = 0;
    k = (k + 1) % K;
    uint_fast64_t nb_max_i;
    for (uint_fast64_t i = 2; i &lt;= sqrt(N); i += 1) {
        if (!(a[i/64 ] &amp; ((uint_fast64_t)1 &lt;&lt; (i %64 ))))
            continue;
        nb_max_i = ((N) - (i * i)) / i;
        uint_fast64_t exitCondition = i * i + (nb_max_i * (k + 1) / K) * i;
        for (uint_fast64_t j = i * i + (nb_max_i * k / K) * i; j &lt;= exitCondition; j += i) {
            a[j / 64] &amp;= ~(((uint_fast64_t)1) &lt;&lt; (j % 64));
        }
    }
    pthread_exit(NULL);
}

int main() {
    pthread_t k[K];
    for (uint_fast64_t i = 0; i &lt; N; i++) {
        a[i] = 0xAAAAAAAAAAAAAAAA;
    }
    for (uint16_t u = 0; u &lt; K; u++) {
        pthread_create(&amp;k[u], NULL, &amp;threadBehaviour, NULL);
    }
    clock_t t;
    t = clock();
    for (uint16_t j = 0; j &lt; K; j++) {
        pthread_join(k[j], NULL);
    }
    t = clock() - t;
    double time_taken = ((double)t) / CLOCKS_PER_SEC;
    printf(&quot;%f seconds&quot;, time_taken);

    for (uint_fast64_t i = 2; i &lt; (N+1); i++) {
        if (a[i / 64] &amp; ((uint_fast64_t)1 &lt;&lt; i)) {
            printf(&quot;| %lu &quot;, i);
        }
    }
    printf(&quot;\n&quot;);
    return 0;
}

especially for a[i/64] I don't understand what this does

答案1

得分: 4

以下是您提供的代码的翻译部分：

测试 `(a[i / 64] &amp; ((uint_fast64_t)1 &lt;&lt; (i % 64)))` 检查全局数组 `a` 中的元素中是否设置了某个位。位数是 `i` 的低 6 位，而元素编号是 `i` 的高位。外部循环的这一初始部分选择下一个素数，将其倍数标记为合数，剩余的数组（或由此线程分配的切片）。

类似地，`a[j / 64] &amp;= ~(((uint_fast64_t)1) &lt;&lt; (j % 64));` 清除了索引 `j` 对应的位，将 `j` 标记为合数。

此实现创建多个线程，每个线程更新数组的不同部分。但这种方法存在重大问题：

线程函数开头的这些行非常糟糕：

    static uint16_t k = 0;
    k = (k + 1) % K;

其目的是确定当前线程修改的全局数组的哪一部分。然而，`static` 变量 `k` 在没有同步机制的情况下被并发线程修改，这种修改不是原子的，因此不能保证每个线程都会获得不同的 `k` 值。您应该分配一个包含适当信息的结构，并将指针作为不透明参数传递，从而消除全局或 `static` 变量的需要。

这种方法中还存在另一个重大问题：所有线程都访问全局数组的开头，而第一个线程在并发修改它。这至少是不美观的，潜在地是不正确的。

还不清楚为什么数组有 `N` 个元素以计算小于 `N` 的素数，`N / 64` 应该足够。

`j` 的初始值和最大值的表达式过于复杂且难以验证。我几乎可以肯定，`j` 的最后一个值在某些情况下可能导致未定义行为。

还要注意，`clock()` 用于测量 CPU 时间，而不是经过的时间。您应该使用 `timespec_get()`、`gettimeofday()` 或其他精确的时间函数。

以下是修改后的版本：

#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>

//#pragma GCC optimize("unroll-loops")
struct sieve_args {
uint64_t *a;
uint64_t maxp, begin, end;
};

void *threadBehaviour(void *opaque) {
struct sieve_args *args = opaque;
uint64_t *a = args->a;

for (uint64_t p = 3; p &lt;= args-&gt;maxp; p += 2) {
if (!(a &amp; ((uint64_t)1 &lt;&lt; (p % 64))))
continue;
/* 清除大于等于 p * p 的奇数倍数，将其标记为合数 */
uint64_t start = (args-&gt;begin + p - 1) / p * p;
if (start &lt; p * p)
start = p * p;
if (start % 2 == 0)
start += p;
uint64_t stop = args-&gt;end;
for (uint64_t j = start; j &lt; stop; j += p + p) {
a[j / 64] &amp;= ~((uint64_t)1 &lt;&lt; (j % 64));
}
}
pthread_exit(NULL);

}

/* 在多线程程序中不能使用 clock() 来测量经过的时间 */
static uint64_t clock_us(void) {
#ifdef TIME_UTC
struct timespec ts;
timespec_get(&ts, TIME_UTC);
return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
#else
struct timeval tt;
gettimeofday(&tt, NULL);
return (uint64_t)tt.tv_sec * 1000000 + tt.tv_usec;
#endif
}

/* 计算不超过 N 的素数个数 */
int main(int argc, char *argv[]) {

uint64_t N = argc &gt; 1 ? strtoull(argv[1], NULL, 0) : 1000000;
int K = argc &gt; 2 ? strtoul(argv[2], NULL, 0) : 7;
/* uint64_t 数组的大小 */
size_t NW = N / 64 + 1;
pthread_t tid[K];
struct sieve_args args[K];
/* 分配至少 N+1 位 */
uint64_t *a = calloc(sizeof(*a), NW);
uint64_t t = clock_us();
/* 使用偶数初始化数组，标记为合数 */
for (size_t i = 0; i &lt; NW; i++) {
a[i] = 0xAAAAAAAAAAAAAAAA;
}
for (int k = 0; k &lt; K; k++) {
args[k].a = a;
args[k].maxp = (uint64_t)+sqrt(N);
args[k].begin = 64 * ((uint64_t)NW * k / K);
args[k].end = 64 * ((uint64_t)NW * (k + 1) / K);
pthread_create(&amp;tid[k], NULL, threadBehaviour, &amp;args[k]);
}
for (int k = 0; k &lt; K; k++) {
pthread_join(tid[k], NULL);
}
t = clock_us() - t;
printf(&quot;%f ms\n&quot;, t / 1000.0);
unsigned long long count = 0;
/* 计算素数个数 */
a[0] &amp;= ~(1 &lt;&lt; 1); /* 将 1 标记为非素数 */
a[0] |= 1 &lt;&lt; 2; /* 将 2 标记为素数 */

英文:

The test (a[i / 64] & ((uint_fast64_t)1 << (i % 64))) checks if a bit is set in an element of the global array a. The bit number is the low 6 bits of i and the element number is the higher bits of i. This initial part of the outer loop selects the next prime number whose multiples to flag as composite in the rest of the array (or the slice thereof assigned to this thread).

Similarly a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64)); clears the corresponding bit for index j, marking j as a composite number.

This implementation creates multiple threads, each of which updates a different portion of the array. Yet there are significant issues in this approach:

These lines at the beginning of the thread function are absolutely horrible:

static uint16_t k = 0;
k = (k + 1) % K;

The purpose is to determine which part of the global array is modified by the current thread. Yet the static variable k is modified by concurrent threads without a synchronisation mechanism and this modification is not atomic so there is no guarantee every thread will get a different value of k. You should instead allocate a structure with the appropriate information and pass a pointer as the opaque argument, removing the need for global or static variables.

There is another major problem in this approach: all threads access the beginning of the global array, that is modified concurrently by the first thread. This is ugly at best and potentially incorrect.

It is also unclear why the array has N elements to compute prime numbers below N, N / 64 should suffice.

The expressions for the initial and maximum values of j are too complicated and difficult to verify. I am almost certain the last value of j may cause undefined behavior in some cases.

Also note that clock() measures cpu time, not elapsed time. You should use timespec_get(), gettimeofday() or another precise time function.

Here is a modified version:

#include &lt;math.h&gt;
#include &lt;pthread.h&gt;
#include &lt;stdint.h&gt;
#include &lt;stdio.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;time.h&gt;
#include &lt;unistd.h&gt;
#include &lt;sys/time.h&gt;
//#pragma GCC optimize(&quot;unroll-loops&quot;)
struct sieve_args {
uint64_t *a;
uint64_t maxp, begin, end;
};
void *threadBehaviour(void *opaque) {
struct sieve_args *args = opaque;
uint64_t *a = args-&gt;a;
for (uint64_t p = 3; p &lt;= args-&gt;maxp; p += 2) {
if (!(a &amp; ((uint64_t)1 &lt;&lt; (p % 64))))
continue;
/* clear odd multiples of p greater or equal to p * p
in the slice as composites */
uint64_t start = (args-&gt;begin + p - 1) / p * p;
if (start &lt; p * p)
start = p * p;
if (start % 2 == 0)
start += p;
uint64_t stop = args-&gt;end;
for (uint64_t j = start; j &lt; stop; j += p + p) {
a[j / 64] &amp;= ~((uint64_t)1 &lt;&lt; (j % 64));
}
}
pthread_exit(NULL);
}
/* cannot use clock() to measure elapsed time in a multi-threaded program */
static uint64_t clock_us(void) {
#ifdef TIME_UTC
struct timespec ts;
timespec_get(&amp;ts, TIME_UTC);
return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
#else
struct timeval tt;
gettimeofday(&amp;tt, NULL);
return (uint64_t)tt.tv_sec * 1000000 + tt.tv_usec;
#endif
}
/* count prime numbers up to and including N */
int main(int argc, char *argv[]) {
uint64_t N = argc &gt; 1 ? strtoull(argv[1], NULL, 0) : 1000000;
int K = argc &gt; 2 ? strtoul(argv[2], NULL, 0) : 7;
/* size of the array of uint64_t */
size_t NW = N / 64 + 1;
pthread_t tid[K];
struct sieve_args args[K];
/* allocate at least N+1 bits */
uint64_t *a = calloc(sizeof(*a), NW);
uint64_t t = clock_us();
/* initialize a with all even numbers composite */
for (size_t i = 0; i &lt; NW; i++) {
a[i] = 0xAAAAAAAAAAAAAAAA;
}
for (int k = 0; k &lt; K; k++) {
args[k].a = a;
args[k].maxp = (uint64_t)+sqrt(N);
args[k].begin = 64 * ((uint64_t)NW * k / K);
args[k].end = 64 * ((uint64_t)NW * (k + 1) / K);
pthread_create(&amp;tid[k], NULL, threadBehaviour, &amp;args[k]);
}
for (int k = 0; k &lt; K; k++) {
pthread_join(tid[k], NULL);
}
t = clock_us() - t;
printf(&quot;%f ms\n&quot;, t / 1000.0);
unsigned long long count = 0;
/* count prime numbers */
a[0] &amp;= ~(1 &lt;&lt; 1); /* set 1 as not prime */
a[0] |= 1 &lt;&lt; 2; /* set 2 as prime */
for (uint64_t i = 2; i &lt;= N; i++) {
if (a[i / 64] &amp; ((uint64_t)1 &lt;&lt; (i % 64))) {
//printf(&quot;| %llu &quot;, i);
count++;
}
}
//printf(&quot;\n&quot;);
printf(&quot;%llu: %llu primes\n&quot;, (unsigned long long)N, count);
free(a);
return 0;
}

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

Sieve Eratosthenes在C中的并行化

问题

答案1

如何告诉 CMake 在链接命令中不传递重复的库

获取Linux监视器分辨率使用C

Is it possible, in C, to write in one line a call to a function that has an array of strings (ie ptr) or int, or … as parameter?

FreeRTOS堆实现是否违反了C语言别名规则？

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论