Fastest way to compute n-gram overlap matrix in Python

I have a large corpus of documents that I want to merge if they have a notable n-gram overlap (in my case, I'm considering bigrams). Consider the list of sets:

corpus = [{'example', 'bigram'}, {'another', 'example'}, {'some', 'outlier'}]

I have the following code to compute the similarity matrix:

import numpy as np

sim_matrix = np.zeros((len(corpus), len(corpus)))
for i in range(len(corpus)):
    for j in range(i+1, len(corpus)):
        sim_matrix[i][j] = get_ngram_overlap(corpus[i], corpus[j])

np.fill_diagonal(sim_matrix, 1)

Where the get_ngram_overlap function is defined as follows:

def get_ngram_overlap(ngrams_s1, ngrams_s2):

    if min(len(ngrams_s1), len(ngrams_s2)) == 0:
        return 0

    common_ngrams = ngrams_s1 & ngrams_s2

    return len(common_ngrams)/min(len(ngrams_s1), len(ngrams_s2))
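
For the corpus above, only the first two sets have a non-zero overlap, since they share 'example':

get_ngram_overlap({'example', 'bigram'}, {'another', 'example'})  # 1/2 = 0.5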

The result is an MxM matrix, where M is the number of documents (sets of bigrams) in my corpus:

print(sim_matrix)
>> [[1.  0.5 0. ]
    [0.  1.  0. ]
    [0.  0.  1. ]]

The problem is that when M grows large, the code is really slow, and the computation becomes expensive because every unique pair has to be compared. Are there ways to compute it more efficiently, such as an optimized distance matrix computation with a custom similarity metric that works for sets of strings? Any help is appreciated!
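
For reference, here is a minimal sketch of what such a vectorized computation could look like (an illustration added for this writeup, not part of the original post; it assumes SciPy is available and every set is non-empty): encode the corpus as a sparse binary document-by-n-gram incidence matrix, get all pairwise intersection sizes with a single matrix product, then divide by the pairwise minimum set sizes:

import numpy as np
from scipy.sparse import csr_matrix

corpus = [{'example', 'bigram'}, {'another', 'example'}, {'some', 'outlier'}]

# sparse binary incidence matrix: one row per document, one column per distinct n-gram
vocab = {w: j for j, w in enumerate(set().union(*corpus))}
rows, cols = zip(*((i, vocab[w]) for i, s in enumerate(corpus) for w in s))
B = csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(len(corpus), len(vocab)))

inter = (B @ B.T).toarray()                   # pairwise intersection sizes
sizes = np.array([len(s) for s in corpus])
sim = inter / np.minimum.outer(sizes, sizes)  # overlap coefficient; diagonal is 1

Note that this produces the full symmetric matrix rather than only the upper triangle, and that the dense MxM result itself becomes the memory bottleneck for very large M.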


Answer 1

Score: 1


With itertools.combinations (to generate all unique pairs of bigram sets) and a triangular fill of the array (np.triu_indices):

import numpy as np
from itertools import combinations

def get_ngram_overlap(ngrams_s1, ngrams_s2):
    min_len = min(len(ngrams_s1), len(ngrams_s2))
    if min_len == 0:
        return 0
    return len(ngrams_s1 & ngrams_s2) / min_len

corpus = [{'example', 'bigram'}, {'another', 'example'}, {'some', 'outlier'}]
sim_matrix = np.zeros((len(corpus), len(corpus)))

# fill upper triangle with calculated overlaps 
sim_matrix[np.triu_indices(len(sim_matrix), 1)] = \
            list(get_ngram_overlap(*c) for c in combinations(corpus, 2))
np.fill_diagonal(sim_matrix, 1)
print(sim_matrix)

[[1.  0.5 0. ]
 [0.  1.  0. ]
 [0.  0.  1. ]]

Answer 2

Score: 1


You can make the computation faster by first building an index and running get_ngram_overlap() only on combinations that have words in common:

import numpy as np
from itertools import combinations

# make the items in corpus frozensets (to make them hashable)
corpus = [frozenset({'example', 'bigram'}), frozenset({'another', 'example'}), frozenset({'some', 'outlier'})]

def get_ngram_overlap(ngrams_s1, ngrams_s2):
    mn = min(len(ngrams_s1), len(ngrams_s2))
    if mn == 0:
        return 0
    common_ngrams = ngrams_s1 & ngrams_s2
    return len(common_ngrams)/mn


index, nums = {}, {}
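# index: word -> bigram sets containing it; nums: bigram set -> its positions in corpus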
for i, b in enumerate(corpus):
    for word in b:
        index.setdefault(word, []).append(b)
    nums.setdefault(b, []).append(i)

index2 = {}
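# compute overlaps only for pairs of bigram sets that share at least one word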
for k, v in index.items():
    for c in combinations(v, 2):
        index2[c] = get_ngram_overlap(*c)

sim_matrix = np.zeros((len(corpus), len(corpus)))

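# write each overlap into the upper triangle, expanding duplicate bigram sets to all their corpus positions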
for a, b in index2:
    for x in nums[a]:
        for y in nums[b]:
            sim_matrix[(x, y) if x < y else (y, x)] = index2[(a, b)]

np.fill_diagonal(sim_matrix, 1)
print(sim_matrix)

Prints:

[[1.  0.5 0. ]
 [0.  1.  0. ]
 [0.  0.  1. ]]

A benchmark with 10,000 bigrams drawn from a vocabulary of 500 distinct words:

import random
import numpy as np
from timeit import timeit
from itertools import combinations


random.seed(123)
corpus = [frozenset({f'word{random.randint(1, 500)}', f'word{random.randint(1, 500)}'}) for _ in range(10_000)]

def get_ngram_overlap(ngrams_s1, ngrams_s2):
    mn = min(len(ngrams_s1), len(ngrams_s2))
    if mn == 0:
        return 0
    common_ngrams = ngrams_s1 & ngrams_s2
    return len(common_ngrams)/mn


def fn1():
    sim_matrix = np.zeros((len(corpus), len(corpus)))
    for i in range(len(corpus)):
        for j in range(i+1, len(corpus)):
            sim_matrix[i][j] = get_ngram_overlap(corpus[i], corpus[j])

    np.fill_diagonal(sim_matrix, 1)
    return sim_matrix

def fn2():
    index, nums = {}, {}
    for i, b in enumerate(corpus):
        for word in b:
            index.setdefault(word, []).append(b)
        nums.setdefault(b, []).append(i)

    index2 = {}
    for k, v in index.items():
        for c in combinations(v, 2):
            index2[c] = get_ngram_overlap(*c)

    sim_matrix = np.zeros((len(corpus), len(corpus)))

    for a, b in index2:
        for x in nums[a]:
            for y in nums[b]:
                sim_matrix[(x, y) if x < y else (y, x)] = index2[(a, b)]

    np.fill_diagonal(sim_matrix, 1)
    return sim_matrix

assert np.array_equal(fn1(), fn2())

t1 = timeit(fn1, number=1)
t2 = timeit(fn2, number=1)

print(t1)
print(t2)

Prints on my machine (Python 3.10/AMD Ryzen 5700x):

19.253960204077885
0.37714757514186203
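
The roughly 50x speedup comes from skipping the vast majority of pairs, which share no word and therefore keep their initial overlap of 0; the exact gain depends on how the vocabulary is distributed across the bigrams.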
