英文:
Resuming count with numpy.unique from previously computed count
问题
I was wondering if there were already existing solutions for "resumable" computations with numpy.
Let me explain: I have a folder with a large number of grayscale images over which I need to compute a sort of histogram using the numpy.unique function. My code looks like this:
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.image as img
import matplotlib.pyplot as plt
# Collect the path of every regular file found directly inside the image folder.
work_dir = 'path/to/my/images'
images = ['/'.join((work_dir, f)) for f in listdir(work_dir) if isfile(join(work_dir, f))]

# Pre-allocate one (nx, ny, nz) array for the whole stack; the first image
# supplies the per-slice shape and the dtype.
nz = len(images)
nx, ny = img.imread(images[0]).shape
volume = np.zeros((nx, ny, nz), dtype=img.imread(images[0]).dtype)
print(volume.shape, nx*ny*nz, volume.dtype)

# Copy each image into its own slice along the last axis.
for i, path in enumerate(images):
    volume[:, :, i] = img.imread(path)

# Histogram = number of occurrences of every distinct value in the full stack.
values, counts = np.unique(volume, return_counts=True)
plt.plot(values, counts)
The problem is that my computer doesn't have enough RAM to allocate the memory needed for the volume, values, and counts arrays all at once.
So is there an already existing solution that would look like this:
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.image as img
import matplotlib.pyplot as plt
# Collect the path of every regular file found directly inside the image folder.
work_dir = 'path/to/my/images'
images = [(work_dir + '/' + f) for f in listdir(work_dir) if isfile(join(work_dir, f))]

# Seed the tally with the unique values of the first image and their counts.
values, counts = np.unique(img.imread(images[0]), return_counts=True)

# Fold in the remaining images one at a time. The loop starts at the second
# image: image 0 already seeded the tally, so visiting it again would count
# its pixels twice.
# NOTE: update_unique is the helper this question is asking for; it is not
# defined in this snippet.
for path in images[1:]:
    values, counts = update_unique(img.imread(path), values, counts, return_counts=True)

plt.plot(values, counts)
I would rather avoid having to implement something myself because of time constraints. I am also open to alternatives that do not use numpy or even python.
英文:
I was wondering if there were already existing solutions for "resumable" computations with numpy.
Let me explain: I have a folder with a large number of grayscale images over which I need to compute a sort of histogram using the numpy.unique function. My code looks like this:
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.image as img
import matplotlib.pyplot as plt
# Collect the path of every regular file found directly inside the image folder.
work_dir = 'path/to/my/images'
images = ['/'.join((work_dir, f)) for f in listdir(work_dir) if isfile(join(work_dir, f))]

# Pre-allocate one (nx, ny, nz) array for the whole stack; the first image
# supplies the per-slice shape and the dtype.
nz = len(images)
nx, ny = img.imread(images[0]).shape
volume = np.zeros((nx, ny, nz), dtype=img.imread(images[0]).dtype)
print(volume.shape, nx*ny*nz, volume.dtype)

# Copy each image into its own slice along the last axis.
for i, path in enumerate(images):
    volume[:, :, i] = img.imread(path)

# Histogram = number of occurrences of every distinct value in the full stack.
values, counts = np.unique(volume, return_counts=True)
plt.plot(values, counts)
The problem is that my computer doesn't have enough RAM to allocate the memory needed for the volume, values, and counts arrays all at once.
So is there an already existing solution that would look like this:
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.image as img
import matplotlib.pyplot as plt
# Collect the path of every regular file found directly inside the image folder.
work_dir = 'path/to/my/images'
images = [(work_dir + '/' + f) for f in listdir(work_dir) if isfile(join(work_dir, f))]

# Seed the tally with the unique values of the first image and their counts.
values, counts = np.unique(img.imread(images[0]), return_counts=True)

# Fold in the remaining images one at a time. The loop starts at the second
# image: image 0 already seeded the tally, so visiting it again would count
# its pixels twice.
# NOTE: update_unique is the helper this question is asking for; it is not
# defined in this snippet.
for path in images[1:]:
    values, counts = update_unique(img.imread(path), values, counts, return_counts=True)

plt.plot(values, counts)
I would rather avoid having to implement something myself because of time constraints. I am also open to alternatives that do not use numpy or even python.
答案1
得分: 0
我有一些空闲时间,所以我尝试自己弄清楚如何做这个。我把它发布在这里,以防有人对做类似的事情感兴趣。我相信我的解决方案应该足够通用,可以用于需要汇总来自几个单独计算的结果并解决重复项的其他计算。
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.image as img
import matplotlib.pyplot as plt
def update_unique(a, old_values, old_counts):
    """Fold the value counts of ``a`` into an existing values/counts tally.

    Parameters
    ----------
    a : array_like
        New data; its unique values and their counts are computed with
        ``np.unique``.
    old_values : array_like
        Unique values accumulated so far, e.g. the first output of
        ``np.unique(..., return_counts=True)``.
    old_counts : array_like
        Number of occurrences of each entry of ``old_values``.

    Returns
    -------
    full_values : ndarray
        Sorted unique values present in either ``old_values`` or ``a``.
    full_counts : ndarray
        For each entry of ``full_values``, the old count plus the count in ``a``.
    """
    # First, compute the values and counts of the new data on its own.
    new_values, new_counts = np.unique(a, return_counts=True)

    # Concatenating lets NumPy promote to a dtype that can hold both inputs,
    # so values from a wider-dtype image are not truncated into the old dtype.
    all_values = np.concatenate((np.asarray(old_values), new_values))
    all_counts = np.concatenate((np.asarray(old_counts), new_counts))

    # A value present on both sides appears twice in all_values; np.unique
    # collapses the duplicates and `inverse` maps each input entry to its
    # slot in the merged, sorted result.
    full_values, inverse = np.unique(all_values, return_inverse=True)

    # np.add.at accumulates repeated indices, so the two counts of a shared
    # value are summed instead of one overwriting the other.
    full_counts = np.zeros(full_values.shape, dtype=all_counts.dtype)
    np.add.at(full_counts, inverse, all_counts)
    return full_values, full_counts
def unique_over_folder(files_list):
    """Tally the unique values found across every image in ``files_list``.

    Parameters
    ----------
    files_list : sequence of str
        Paths of the images to read with ``img.imread``; must hold at least
        one entry.

    Returns
    -------
    values, counts
        The merged unique values and their total number of occurrences, as
        produced by chaining ``update_unique`` over all images.
    """
    # The first image seeds the running tally.
    values, counts = np.unique(img.imread(files_list[0]), return_counts=True)
    # Every remaining image is read and merged into the tally one at a time.
    for path in files_list[1:]:
        values, counts = update_unique(img.imread(path), values, counts)
    return values, counts
# Collect the path of every regular file found directly inside the data folder.
work_dir = 'path/to/your/data'
images = ['/'.join((work_dir, f)) for f in listdir(work_dir) if isfile(join(work_dir, f))]

# Tally two individual images, then merge the second image into the first tally.
# NOTE: images[1000] requires the folder to contain at least 1001 files.
values0, counts0 = np.unique(img.imread(images[0]), return_counts=True)
values1000, counts1000 = np.unique(img.imread(images[1000]), return_counts=True)
values_fus, counts_fus = update_unique(img.imread(images[1000]), values0, counts0)

# Example: combining two values/counts pairs (top panel).
fig, axs = plt.subplots(2, 1, sharex=True)
top, bottom = axs
for xs, ys, label in (
    (values0, counts0, 'image 0'),
    (values1000, counts1000, 'image 1000'),
    (values_fus, counts_fus, 'combined'),
):
    top.plot(xs, ys, '.', label=label)
top.legend()
top.set_title('Example combining two values/counts pairs')

# Example: running over the full list of images (bottom panel).
values, counts = unique_over_folder(images)
bottom.plot(values, counts, '.')
bottom.set_title('Example with full list of images')

fig.savefig('exampes.png')
英文:
I've had a little free time, so I tried to figure out how to do this on my own. I'm posting it here in case someone is interested in doing something similar. I believe my solution should be general enough that it can be used for other computations that need to aggregate the results from several separate computations while resolving duplicates.
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.image as img
import matplotlib.pyplot as plt
def update_unique(a, old_values, old_counts):
    """Fold the value counts of ``a`` into an existing values/counts tally.

    Parameters
    ----------
    a : array_like
        New data; its unique values and their counts are computed with
        ``np.unique``.
    old_values : array_like
        Unique values accumulated so far, e.g. the first output of
        ``np.unique(..., return_counts=True)``.
    old_counts : array_like
        Number of occurrences of each entry of ``old_values``.

    Returns
    -------
    full_values : ndarray
        Sorted unique values present in either ``old_values`` or ``a``.
    full_counts : ndarray
        For each entry of ``full_values``, the old count plus the count in ``a``.
    """
    # First, compute the values and counts of the new data on its own.
    new_values, new_counts = np.unique(a, return_counts=True)

    # Concatenating lets NumPy promote to a dtype that can hold both inputs,
    # so values from a wider-dtype image are not truncated into the old dtype.
    all_values = np.concatenate((np.asarray(old_values), new_values))
    all_counts = np.concatenate((np.asarray(old_counts), new_counts))

    # A value present on both sides appears twice in all_values; np.unique
    # collapses the duplicates and `inverse` maps each input entry to its
    # slot in the merged, sorted result.
    full_values, inverse = np.unique(all_values, return_inverse=True)

    # np.add.at accumulates repeated indices, so the two counts of a shared
    # value are summed instead of one overwriting the other.
    full_counts = np.zeros(full_values.shape, dtype=all_counts.dtype)
    np.add.at(full_counts, inverse, all_counts)
    return full_values, full_counts
def unique_over_folder(files_list):
    """Tally the unique values found across every image in ``files_list``.

    Parameters
    ----------
    files_list : sequence of str
        Paths of the images to read with ``img.imread``; must hold at least
        one entry.

    Returns
    -------
    values, counts
        The merged unique values and their total number of occurrences, as
        produced by chaining ``update_unique`` over all images.
    """
    # The first image seeds the running tally.
    values, counts = np.unique(img.imread(files_list[0]), return_counts=True)
    # Every remaining image is read and merged into the tally one at a time.
    for path in files_list[1:]:
        values, counts = update_unique(img.imread(path), values, counts)
    return values, counts
# Collect the path of every regular file found directly inside the data folder.
work_dir = 'path/to/your/data'
images = ['/'.join((work_dir, f)) for f in listdir(work_dir) if isfile(join(work_dir, f))]

# Tally two individual images, then merge the second image into the first tally.
# NOTE: images[1000] requires the folder to contain at least 1001 files.
values0, counts0 = np.unique(img.imread(images[0]), return_counts=True)
values1000, counts1000 = np.unique(img.imread(images[1000]), return_counts=True)
values_fus, counts_fus = update_unique(img.imread(images[1000]), values0, counts0)

# Example combining two values/counts pairs (top panel).
fig, axs = plt.subplots(2, 1, sharex=True)
top, bottom = axs
for xs, ys, label in (
    (values0, counts0, 'image 0'),
    (values1000, counts1000, 'image 1000'),
    (values_fus, counts_fus, 'combined'),
):
    top.plot(xs, ys, '.', label=label)
top.legend()
top.set_title('Example combining two values/counts pairs')

# Example with the full list of images (bottom panel).
values, counts = unique_over_folder(images)
bottom.plot(values, counts, '.')
bottom.set_title('Example with full list of images')

fig.savefig('exampes.png')
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论