Numba crashes Python with parallel=True flag set
Question

I'm trying to speed up some calculations using Numba. Calling the `nnleapfrog_integrate` function leads to a segmentation fault and crashes the Python process. The function works fine if the `parallel=True` flag is removed from its jit decorator, but then it runs in a single thread. I want this function to run as fast as possible, so I want it to run multi-threaded and utilize all the CPU cores.
```lang-python
from numba import jit, prange
import numpy as np

@jit('Tuple((f8[:,:,::1],f8[:,:,::1]))(f8[:,::1], f8[:,::1], f8[::1], i8, i8, i8, f8, f8)', nopython=True, parallel=True)
def nnleapfrog_integrate(pos, vel, mass, i_steps, r_steps, dt, G, softening):
    N = pos.shape[0]
    pos_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    vel_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    data_idx = 0
    acc = np.zeros((N,3))
    for s in range(i_steps):
        vel += acc * dt/2.0
        pos += vel * dt
        for i in prange(N):
            acc[i,0] = 0
            acc[i,1] = 0
            acc[i,2] = 0
            for j in range(N):
                dx = pos[j,0] - pos[i,0]
                dy = pos[j,1] - pos[i,1]
                dz = pos[j,2] - pos[i,2]
                inv_r3 = (dx**2 + dy**2 + dz**2 + softening**2)**(-1.5)
                acc[i,0] += G * (dx * inv_r3) * mass[j]
                acc[i,1] += G * (dy * inv_r3) * mass[j]
                acc[i,2] += G * (dz * inv_r3) * mass[j]
        vel += acc * dt/2.0
        if s % r_steps == 0:
            pos_data[data_idx] = pos
            vel_data[data_idx] = vel
            data_idx += 1
    return pos_data, vel_data

N = 10
dt = 60
pos = np.random.rand(N, 3)
vel = np.random.rand(N, 3)
m = np.random.rand(N)
softening = 1e3
G = 6.67430e-11
t_max = 3600*24*30
i_steps = int(t_max/dt)
r_steps = int(3600*24/dt)

r_i, v_i = nnleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
```
What I have already tried

Because only the `for i in prange(N):` loop is suitable for parallelization, I separated it into its own function, `getAcc`, which works fine with the `parallel=True` flag and utilizes all the CPU cores.
```lang-python
from numba import jit, prange
import numpy as np

@jit('f8[:, ::1](f8[:, ::1], f8[::1], f8, f8)', nopython=True, parallel=True)
def getAcc(pos, mass, G, softening):
    N = pos.shape[0]
    a = np.zeros((N,3))
    for i in prange(N):
        for j in range(N):
            dx = pos[j,0] - pos[i,0]
            dy = pos[j,1] - pos[i,1]
            dz = pos[j,2] - pos[i,2]
            inv_r3 = (dx**2 + dy**2 + dz**2 + softening**2)**(-1.5)
            a[i,0] += G * (dx * inv_r3) * mass[j]
            a[i,1] += G * (dy * inv_r3) * mass[j]
            a[i,2] += G * (dz * inv_r3) * mass[j]
    return a

@jit('Tuple((f8[:,:,::1],f8[:,:,::1]))(f8[:,::1], f8[:,::1], f8[::1], i8, i8, i8, f8, f8)', nopython=True)
def nleapfrog_integrate(pos, vel, mass, i_steps, r_steps, dt, G, softening):
    N = pos.shape[0]
    pos_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    vel_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    data_idx = 0
    acc = getAcc(pos, mass, G, softening)
    for i in range(i_steps):
        vel += acc * dt/2.0
        pos += vel * dt
        acc = getAcc(pos, mass, G, softening)
        vel += acc * dt/2.0
        if i % r_steps == 0:
            pos_data[data_idx] = pos
            vel_data[data_idx] = vel
            data_idx += 1
    return pos_data, vel_data

N = 10
dt = 60
pos = np.random.rand(N, 3)
vel = np.random.rand(N, 3)
m = np.random.rand(N)
softening = 1e3
G = 6.67430e-11
t_max = 3600*24*30
i_steps = int(t_max/dt)
r_steps = int(3600*24/dt)

r_i, v_i = nleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
```
But this turned out to be more than 3 times slower than the single-threaded version of the original function, in which the loop was inlined:

```
In [4]: %timeit r_i, v_i = nleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
8.51 s ± 46.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [5]: %timeit r_i, v_i = nnleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
2.53 s ± 18.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```

Therefore, for the best performance, I need the original function with the inlined `for i in prange(N):` loop to run multi-threaded.
Answer 1

Score: 3
The parallelization of the i-based loop is not efficient because creating and synchronizing threads is expensive. This overhead is usually at least tens of microseconds on a PC, and often significantly bigger on a computing server (mainly because of the additional cores). The problem is that there are `i_steps=43200` iterations, so even tens of microseconds of overhead per step adds up to a few seconds (e.g. 43200 × 50 µs ≈ 2.2 s). With `N=10`, there is not enough work per step to use threads efficiently.
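One way to gauge this overhead on a given machine is to time an essentially empty `prange` loop. Below is a minimal sketch, not code from the original post: the function name and repetition count are arbitrary, and the measured figure varies with CPU and thread count.

```lang-python
# Sketch: estimate the per-call cost of entering/leaving a parallel region.
# The loop body is trivial, so the time is dominated by thread fork/join.
import time
from numba import njit, prange

@njit(parallel=True)
def empty_parallel(n):
    s = 0.0
    for i in prange(n):  # supported scalar reduction; body does almost nothing
        s += 0.0
    return s

empty_parallel(10)  # trigger compilation outside the timed section

reps = 1000
t0 = time.perf_counter()
for _ in range(reps):
    empty_parallel(10)
print(f"~{(time.perf_counter() - t0) / reps * 1e6:.1f} µs per call")
```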
Besides, note that there is a bug in Numba 0.57.0 that causes a segmentation fault on this code, so I am not sure it is even safe to parallelize it.
Fortunately, the serial code can be optimized:

- `x * y**(-1.5)` is not efficient because Numba uses the expensive exponentiation function `pow` to compute it. You can use `x / (y * sqrt(y))` instead. This is significantly faster because most CPUs have an integrated hardware unit that computes square roots and divisions relatively efficiently.
- The `fastmath` option is not enabled, so Numba cannot assume that `x*y` equals `y*x`, which prevents some optimizations. Enabling this flag can be risky, but the same optimization can be done manually by pre-computing values in the inner loop (a sketch of the flag itself is shown right after this list).
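For reference, a minimal sketch of what enabling the flag looks like on the acceleration kernel; `fastmath=True` is a standard Numba option, but it relaxes IEEE-754 semantics, so results may differ in the last bits. The name `getAcc_fm` is illustrative only:

```lang-python
# Sketch: acceleration kernel compiled with fastmath=True, which lets LLVM
# reassociate floating-point operations. Verify the accuracy is acceptable.
from numba import njit
import numpy as np

@njit(fastmath=True)
def getAcc_fm(pos, mass, G, softening):
    N = pos.shape[0]
    a = np.zeros((N, 3))
    for i in range(N):
        for j in range(N):
            dx = pos[j, 0] - pos[i, 0]
            dy = pos[j, 1] - pos[i, 1]
            dz = pos[j, 2] - pos[i, 2]
            r2 = dx*dx + dy*dy + dz*dz + softening*softening
            inv_r3 = 1.0 / (r2 * np.sqrt(r2))  # avoids the expensive pow call
            a[i, 0] += G * dx * inv_r3 * mass[j]
            a[i, 1] += G * dy * inv_r3 * mass[j]
            a[i, 2] += G * dz * inv_r3 * mass[j]
    return a
```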
Here is the resulting optimized code:
```lang-python
@jit('Tuple((f8[:,:,::1],f8[:,:,::1]))(f8[:,::1], f8[:,::1], f8[::1], i8, i8, i8, f8, f8)', nopython=True)
def nnleapfrog_integrate(pos, vel, mass, i_steps, r_steps, dt, G, softening):
    N = pos.shape[0]
    pos_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    vel_data = np.zeros((int(np.ceil(i_steps/r_steps)), N, 3))
    data_idx = 0
    acc = np.zeros((N,3))
    for s in range(i_steps):
        vel += acc * dt/2.0
        pos += vel * dt
        for i in range(N):
            acc[i,0] = 0.0
            acc[i,1] = 0.0
            acc[i,2] = 0.0
            for j in range(N):
                dx = pos[j,0] - pos[i,0]
                dy = pos[j,1] - pos[i,1]
                dz = pos[j,2] - pos[i,2]
                tmp1 = dx**2 + dy**2 + dz**2 + softening**2
                tmp2 = G * mass[j] / (tmp1 * np.sqrt(tmp1))
                acc[i,0] += tmp2 * dx
                acc[i,1] += tmp2 * dy
                acc[i,2] += tmp2 * dz
        vel += acc * dt/2.0
        if s % r_steps == 0:
            pos_data[data_idx] = pos
            vel_data[data_idx] = vel
            data_idx += 1
    return pos_data, vel_data
```
This code is about 5 times faster on my machine with an i5-9600KF processor. It runs in approximately 31 ms, which means each iteration of the outer loop takes only about 0.72 µs, far less than the overhead of thread creation/synchronization.
Further optimizations include pre-computing `G * mass[j]` and computing the division/square root with SIMD instructions. The former is easy to do; the latter is a bit tricky, especially in Numba.
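A minimal sketch of the first idea, with `G * mass` and `softening**2` hoisted out of the pair loop so the inner loop does one multiplication less per pair; the `_gm` names are illustrative, not from the original answer:

```lang-python
# Sketch: pre-compute G * mass once per call instead of once per (i, j) pair.
from numba import njit
import numpy as np

@njit
def getAcc_gm(pos, mass, G, softening):
    N = pos.shape[0]
    g_mass = G * mass      # hoisted: one multiply per body, not per pair
    soft2 = softening**2   # hoisted constant
    a = np.zeros((N, 3))
    for i in range(N):
        for j in range(N):
            dx = pos[j, 0] - pos[i, 0]
            dy = pos[j, 1] - pos[i, 1]
            dz = pos[j, 2] - pos[i, 2]
            tmp1 = dx*dx + dy*dy + dz*dz + soft2
            tmp2 = g_mass[j] / (tmp1 * np.sqrt(tmp1))
            a[i, 0] += tmp2 * dx
            a[i, 1] += tmp2 * dy
            a[i, 2] += tmp2 * dz
    return a
```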