Numba crashes Python with parallel=True flag set
Question

I'm trying to speed up some calculations using Numba. Calling the `nnleapfrog_integrate` function leads to a segmentation fault and crashes the Python process. The function works fine if the `parallel=True` flag is removed from its jit decorator, but then it runs in a single thread. I want this function to run as fast as possible, so I want it to run multi-threaded and utilize all the CPU cores.
```lang-python
from numba import jit, prange
import numpy as np

@jit('Tuple((f8[:,:,::1],f8[:,:,::1]))(f8[:,::1], f8[:,::1], f8[::1], i8, i8, i8, f8, f8)', nopython=True, parallel=True)
def nnleapfrog_integrate(pos, vel, mass, i_steps, r_steps, dt, G, softening):
    N = pos.shape[0]
    pos_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    vel_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    data_idx = 0
    acc = np.zeros((N,3))
    for s in range(i_steps):
        vel += acc * dt/2.0
        pos += vel * dt
        for i in prange(N):
            acc[i,0] = 0
            acc[i,1] = 0
            acc[i,2] = 0
            for j in range(N):
                dx = pos[j,0] - pos[i,0]
                dy = pos[j,1] - pos[i,1]
                dz = pos[j,2] - pos[i,2]
                inv_r3 = (dx**2 + dy**2 + dz**2 + softening**2)**(-1.5)
                acc[i,0] += G * (dx * inv_r3) * mass[j]
                acc[i,1] += G * (dy * inv_r3) * mass[j]
                acc[i,2] += G * (dz * inv_r3) * mass[j]
        vel += acc * dt/2.0
        if s % r_steps == 0:
            pos_data[data_idx] = pos
            vel_data[data_idx] = vel
            data_idx += 1
    return pos_data, vel_data

N = 10
dt = 60
pos = np.random.rand(N, 3)
vel = np.random.rand(N, 3)
m = np.random.rand(N)
softening = 1e3
G = 6.67430e-11
t_max = 3600*24*30
i_steps = int(t_max/dt)
r_steps = int(3600*24/dt)

r_i, v_i = nnleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
```
What I have already tried

Because only the `for i in prange(N):` loop is suitable for parallelization, I separated it into its own function, `getAcc`, which works fine with the `parallel=True` flag and utilizes all the CPU cores.
```lang-python
from numba import jit, prange
import numpy as np

@jit('f8[:, ::1](f8[:, ::1], f8[::1], f8, f8)', nopython=True, parallel=True)
def getAcc(pos, mass, G, softening):
    N = pos.shape[0]
    a = np.zeros((N,3))
    for i in prange(N):
        for j in range(N):
            dx = pos[j,0] - pos[i,0]
            dy = pos[j,1] - pos[i,1]
            dz = pos[j,2] - pos[i,2]
            inv_r3 = (dx**2 + dy**2 + dz**2 + softening**2)**(-1.5)
            a[i,0] += G * (dx * inv_r3) * mass[j]
            a[i,1] += G * (dy * inv_r3) * mass[j]
            a[i,2] += G * (dz * inv_r3) * mass[j]
    return a

@jit('Tuple((f8[:,:,::1],f8[:,:,::1]))(f8[:,::1], f8[:,::1], f8[::1], i8, i8, i8, f8, f8)', nopython=True)
def nleapfrog_integrate(pos, vel, mass, i_steps, r_steps, dt, G, softening):
    N = pos.shape[0]
    pos_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    vel_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    data_idx = 0
    acc = getAcc(pos, mass, G, softening)
    for i in range(i_steps):
        vel += acc * dt/2.0
        pos += vel * dt
        acc = getAcc(pos, mass, G, softening)
        vel += acc * dt/2.0
        if i % r_steps == 0:
            pos_data[data_idx] = pos
            vel_data[data_idx] = vel
            data_idx += 1
    return pos_data, vel_data

N = 10
dt = 60
pos = np.random.rand(N, 3)
vel = np.random.rand(N, 3)
m = np.random.rand(N)
softening = 1e3
G = 6.67430e-11
t_max = 3600*24*30
i_steps = int(t_max/dt)
r_steps = int(3600*24/dt)

r_i, v_i = nleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
```
But this turned out to be more than 3 times slower than the single-threaded version of the original function, in which the loop was inlined:

```
In [4]: %timeit r_i, v_i = nleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
8.51 s ± 46.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [5]: %timeit r_i, v_i = nnleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
2.53 s ± 18.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```

Therefore, for the best performance, I need the original function with the inlined `for i in prange(N):` loop to run multi-threaded.
Answer 1

Score: 3
The parallelization of the i-based loop is not efficient because creating and synchronizing threads is expensive. This overhead is usually at least tens of microseconds on a PC, and often significantly bigger on a computing server (mainly because of the additional cores). The problem is that there are `i_steps=43200` iterations, so even tens of microseconds of overhead per step adds up to a few seconds (e.g. 43200 × 50 µs ≈ 2.2 s). With `N=10`, there is not enough work per step to use threads efficiently.
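One way to gauge this overhead on a given machine is to time an essentially empty `prange` loop. Below is a minimal sketch, not code from the original post: the function name and repetition count are arbitrary, and the measured figure varies with CPU and thread count.

```lang-python
# Sketch: estimate the per-call cost of entering/leaving a parallel region.
# The loop body is trivial, so the time is dominated by thread fork/join.
import time
from numba import njit, prange

@njit(parallel=True)
def empty_parallel(n):
    s = 0.0
    for i in prange(n):  # supported scalar reduction; body does almost nothing
        s += 0.0
    return s

empty_parallel(10)  # trigger compilation outside the timed section

reps = 1000
t0 = time.perf_counter()
for _ in range(reps):
    empty_parallel(10)
print(f"~{(time.perf_counter() - t0) / reps * 1e6:.1f} µs per call")
```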
Besides, note that there is a bug in Numba 0.57.0 that causes a segmentation fault on this code, so I am not sure it is even safe to parallelize it.
Fortunately, the serial code can be optimized:

- `x * y**(-1.5)` is not efficient because Numba uses the expensive exponentiation function `pow` to compute it. You can use `x / (y * sqrt(y))` instead. This is significantly faster because most CPUs have an integrated hardware unit that computes square roots and divisions relatively efficiently.
- The `fastmath` option is not enabled, so Numba cannot assume that `x*y` equals `y*x`, which prevents some optimizations. Enabling this flag can be risky, but the same optimization can be done manually by pre-computing values in the inner loop (a sketch of the flag itself is shown right after this list).
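For reference, a minimal sketch of what enabling the flag looks like on the acceleration kernel; `fastmath=True` is a standard Numba option, but it relaxes IEEE-754 semantics, so results may differ in the last bits. The name `getAcc_fm` is illustrative only:

```lang-python
# Sketch: acceleration kernel compiled with fastmath=True, which lets LLVM
# reassociate floating-point operations. Verify the accuracy is acceptable.
from numba import njit
import numpy as np

@njit(fastmath=True)
def getAcc_fm(pos, mass, G, softening):
    N = pos.shape[0]
    a = np.zeros((N, 3))
    for i in range(N):
        for j in range(N):
            dx = pos[j, 0] - pos[i, 0]
            dy = pos[j, 1] - pos[i, 1]
            dz = pos[j, 2] - pos[i, 2]
            r2 = dx*dx + dy*dy + dz*dz + softening*softening
            inv_r3 = 1.0 / (r2 * np.sqrt(r2))  # avoids the expensive pow call
            a[i, 0] += G * dx * inv_r3 * mass[j]
            a[i, 1] += G * dy * inv_r3 * mass[j]
            a[i, 2] += G * dz * inv_r3 * mass[j]
    return a
```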
Here is the resulting optimized code:
```lang-python
@jit('Tuple((f8[:,:,::1],f8[:,:,::1]))(f8[:,::1], f8[:,::1], f8[::1], i8, i8, i8, f8, f8)', nopython=True)
def nnleapfrog_integrate(pos, vel, mass, i_steps, r_steps, dt, G, softening):
    N = pos.shape[0]
    pos_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    vel_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    data_idx = 0
    acc = np.zeros((N,3))
    for s in range(i_steps):
        vel += acc * dt/2.0
        pos += vel * dt
        for i in range(N):
            acc[i,0] = 0.0
            acc[i,1] = 0.0
            acc[i,2] = 0.0
            for j in range(N):
                dx = pos[j,0] - pos[i,0]
                dy = pos[j,1] - pos[i,1]
                dz = pos[j,2] - pos[i,2]
                tmp1 = dx**2 + dy**2 + dz**2 + softening**2
                tmp2 = G * mass[j] / (tmp1 * np.sqrt(tmp1))
                acc[i,0] += tmp2 * dx
                acc[i,1] += tmp2 * dy
                acc[i,2] += tmp2 * dz
        vel += acc * dt/2.0
        if s % r_steps == 0:
            pos_data[data_idx] = pos
            vel_data[data_idx] = vel
            data_idx += 1
    return pos_data, vel_data
```
This code is about 5 times faster on my machine with an i5-9600KF processor. It runs in approximately 31 ms, which means each iteration of the outer loop takes only about 0.72 µs, far less than the overhead of thread creation/synchronization.
Further optimizations include pre-computing `G * mass[j]` and computing the division/square root with SIMD instructions. The former is easy to do; the latter is a bit tricky, especially in Numba.
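A minimal sketch of the first idea, with `G * mass` and `softening**2` hoisted out of the pair loop so the inner loop does one multiplication less per pair; the `_gm` names are illustrative, not from the original answer:

```lang-python
# Sketch: pre-compute G * mass once per call instead of once per (i, j) pair.
from numba import njit
import numpy as np

@njit
def getAcc_gm(pos, mass, G, softening):
    N = pos.shape[0]
    g_mass = G * mass      # hoisted: one multiply per body, not per pair
    soft2 = softening**2   # hoisted constant
    a = np.zeros((N, 3))
    for i in range(N):
        for j in range(N):
            dx = pos[j, 0] - pos[i, 0]
            dy = pos[j, 1] - pos[i, 1]
            dz = pos[j, 2] - pos[i, 2]
            tmp1 = dx*dx + dy*dy + dz*dz + soft2
            tmp2 = g_mass[j] / (tmp1 * np.sqrt(tmp1))
            a[i, 0] += tmp2 * dx
            a[i, 1] += tmp2 * dy
            a[i, 2] += tmp2 * dz
    return a
```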