Fastest way to compute n-gram overlap matrix in Python
Question
I have a large corpus of documents that I want to merge if they have a notable n-gram overlap (in my case, I'm considering bigrams). Consider the list of sets:
corpus = [{'example', 'bigram'}, {'another', 'example'}, {'some', 'outlier'}]
I have the following code to compute the similarity matrix:
import numpy as np

sim_matrix = np.zeros((len(corpus), len(corpus)))
for i in range(len(corpus)):
    for j in range(i+1, len(corpus)):
        sim_matrix[i][j] = get_ngram_overlap(corpus[i], corpus[j])
np.fill_diagonal(sim_matrix, 1)
where the get_ngram_overlap function is defined as follows:
def get_ngram_overlap(ngrams_s1, ngrams_s2):
    if min(len(ngrams_s1), len(ngrams_s2)) == 0:
        return 0
    common_ngrams = ngrams_s1 & ngrams_s2
    return len(common_ngrams) / min(len(ngrams_s1), len(ngrams_s2))
The result is an MxM matrix, where M is the number of documents (bigram sets) in my corpus:
print(sim_matrix)
>> [[1.  0.5 0. ]
    [0.  1.  0. ]
    [0.  0.  1. ]]
The problem is that when M grows large the code is really slow, and the computation becomes expensive because every unique pair has to be compared. Are there ways to compute it more efficiently, for example an optimized distance matrix computation that accepts a custom similarity metric for sets of strings? Any help is appreciated!
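For reference, the kind of routine alluded to here does exist: scipy.spatial.distance.pdist accepts a custom callable as its metric. Since pdist requires numeric input, one workaround (an assumption for illustration, not something from the question) is to pass row indices and look the sets up inside the callable. Note this still evaluates all unique pairs, so it does not change the asymptotic cost:

import numpy as np
from scipy.spatial.distance import pdist, squareform

# pdist needs a 2-D numeric array, so each set is represented by its index
idx = np.arange(len(corpus)).reshape(-1, 1)
condensed = pdist(idx, metric=lambda u, v: get_ngram_overlap(corpus[int(u[0])],
                                                             corpus[int(v[0])]))
sim_matrix = squareform(condensed)  # symmetric, unlike the upper-triangular loop
np.fill_diagonal(sim_matrix, 1)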
Answer 1
Score: 1
With itertools.combinations (to get all unique pairs of bigram sets) and a triangular assignment into the array (np.triu_indices):
import numpy as np
from itertools import combinations

def get_ngram_overlap(ngrams_s1, ngrams_s2):
    min_len = min(len(ngrams_s1), len(ngrams_s2))
    if min_len == 0:
        return 0
    return len(ngrams_s1 & ngrams_s2) / min_len

corpus = [{'example', 'bigram'}, {'another', 'example'}, {'some', 'outlier'}]

sim_matrix = np.zeros((len(corpus), len(corpus)))
# fill the upper triangle with the calculated overlaps, one value per pair
sim_matrix[np.triu_indices(len(sim_matrix), 1)] = \
    [get_ngram_overlap(*c) for c in combinations(corpus, 2)]
np.fill_diagonal(sim_matrix, 1)
print(sim_matrix)
Output:

[[1.  0.5 0. ]
 [0.  1.  0. ]
 [0.  0.  1. ]]
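The one-line assignment works because combinations() yields the pairs in the same row-major order in which np.triu_indices() enumerates the upper triangle. A quick sanity check of that ordering (an illustrative sketch, not part of the original answer):

import numpy as np
from itertools import combinations

# np.triu_indices(n, 1) walks the upper triangle row by row,
# exactly the order combinations(range(n), 2) produces
n = 4
rows, cols = np.triu_indices(n, 1)
assert list(zip(rows.tolist(), cols.tolist())) == list(combinations(range(n), 2))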
Answer 2
Score: 1
You can make the computation faster by first building an index and running get_ngram_overlap() only on the combinations that have common words:
import numpy as np
from itertools import combinations

# make the items in corpus frozensets (to make them hashable)
corpus = [frozenset({'example', 'bigram'}), frozenset({'another', 'example'}), frozenset({'some', 'outlier'})]

def get_ngram_overlap(ngrams_s1, ngrams_s2):
    mn = min(len(ngrams_s1), len(ngrams_s2))
    if mn == 0:
        return 0
    common_ngrams = ngrams_s1 & ngrams_s2
    return len(common_ngrams) / mn

# index: word -> bigram sets that contain it
# nums:  bigram set -> its positions in the corpus
index, nums = {}, {}
for i, b in enumerate(corpus):
    for word in b:
        index.setdefault(word, []).append(b)
    nums.setdefault(b, []).append(i)

# score only pairs that share at least one word; the dict deduplicates pairs
index2 = {}
for k, v in index.items():
    for c in combinations(v, 2):
        index2[c] = get_ngram_overlap(*c)

sim_matrix = np.zeros((len(corpus), len(corpus)))
for a, b in index2:
    for x in nums[a]:
        for y in nums[b]:
            sim_matrix[(x, y) if x < y else (y, x)] = index2[(a, b)]
np.fill_diagonal(sim_matrix, 1)
print(sim_matrix)
Prints:
[[1.  0.5 0. ]
 [0.  1.  0. ]
 [0.  0.  1. ]]
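The speedup comes from never enumerating pairs with no shared word: in the toy corpus only the first two sets share a word ('example'), so get_ngram_overlap() runs exactly once. One way to inspect which words actually trigger comparisons (an illustrative sketch, not part of the original answer):

# show only the words shared by more than one bigram set
for word, sets in index.items():
    if len(sets) > 1:
        print(word, '->', [sorted(s) for s in sets])
# example -> [['bigram', 'example'], ['another', 'example']]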
A benchmark with 10k bigrams of 500 different words:
import random
import numpy as np
from timeit import timeit
from itertools import combinations

random.seed(123)
corpus = [frozenset({f'word{random.randint(1, 500)}', f'word{random.randint(1, 500)}'}) for _ in range(10_000)]

def get_ngram_overlap(ngrams_s1, ngrams_s2):
    mn = min(len(ngrams_s1), len(ngrams_s2))
    if mn == 0:
        return 0
    common_ngrams = ngrams_s1 & ngrams_s2
    return len(common_ngrams) / mn

def fn1():
    # original version: O(M^2) double loop over all unique pairs
    sim_matrix = np.zeros((len(corpus), len(corpus)))
    for i in range(len(corpus)):
        for j in range(i+1, len(corpus)):
            sim_matrix[i][j] = get_ngram_overlap(corpus[i], corpus[j])
    np.fill_diagonal(sim_matrix, 1)
    return sim_matrix

def fn2():
    # index-based version: only pairs sharing a word are scored
    index, nums = {}, {}
    for i, b in enumerate(corpus):
        for word in b:
            index.setdefault(word, []).append(b)
        nums.setdefault(b, []).append(i)
    index2 = {}
    for k, v in index.items():
        for c in combinations(v, 2):
            index2[c] = get_ngram_overlap(*c)
    sim_matrix = np.zeros((len(corpus), len(corpus)))
    for a, b in index2:
        for x in nums[a]:
            for y in nums[b]:
                sim_matrix[(x, y) if x < y else (y, x)] = index2[(a, b)]
    np.fill_diagonal(sim_matrix, 1)
    return sim_matrix

assert np.array_equal(fn1(), fn2())

t1 = timeit(fn1, number=1)
t2 = timeit(fn2, number=1)

print(t1)
print(t2)
Prints on my machine (Python 3.10/AMD Ryzen 5700x):
19.253960204077885
0.37714757514186203
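A further note on scale: at M = 10,000 the dense float64 matrix alone takes roughly 800 MB, while most entries are zero. If memory becomes the bottleneck, the same fill loop can target a sparse matrix instead, reusing the index2 and nums structures built above. A minimal sketch, assuming SciPy is available (scipy.sparse.dok_matrix supports item assignment):

from scipy.sparse import dok_matrix

# same write pattern as fn2(), but only the non-zero overlaps are stored
sim_sparse = dok_matrix((len(corpus), len(corpus)))
for a, b in index2:
    for x in nums[a]:
        for y in nums[b]:
            sim_sparse[(x, y) if x < y else (y, x)] = index2[(a, b)]
sim_sparse.setdiag(1)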
Comments