Create stacked numpy array from grouped dataframe

Question

I need a fast function that creates a single stacked numpy array from a Pandas dataframe after grouping it, with rows for missing group combinations filled in. The output array should have shape (n_unique_values_1, n_unique_values_2, ..., n_ungrouped_columns) for group-by columns 1, 2, .... Missing values should be filled with NaN; you may assume that all values can safely be handled as numeric.

Example:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [1, 1, 2],
    'b': [0, 1, 0],
    'c': [1, 1, 1],
    'd': [0, 0, 0]
})
grouped = df.groupby(['a', 'b']).agg('sum')
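
For reference, grouped then looks like this; note that the (a=2, b=1) combination is missing:

     c  d
a b
1 0  1  0
  1  1  0
2 0  1  0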

I need a function on grouped that returns a numpy array of shape (df.a.nunique(), df.b.nunique(), n_ungrouped_cols) (in this case, (2, 2, 2)). The function should work with any number of group-by columns, and the returned array should have its axes in the same order as the groupby. I need to run this on many millions of values in a pipeline that has a lot to do, so speed matters a great deal. Note that Pandas groupby preserves the ascending order of the unique values, and that order must not be lost. If you can write this without using a grouped dataframe, go for it. Any imports (numba, etc.) that make this quick are acceptable, as long as they come from well-maintained code bases.

The following can be used to create test cases; test_case in this example has missing rows:

def create_synthetic_df(len_df, n_uniques: list[int]):
    """Build a random integer dataframe with one column per entry in n_uniques."""
    rng = np.random.default_rng(seed=2)
    if len(n_uniques) > 10:
        n_uniques = n_uniques[:10]  # column names are drawn from 'abcdefghij'
    dct = {}
    for col, n_unique in zip('abcdefghij', n_uniques):
        dct[col] = rng.integers(0, n_unique, size=len_df)
    return pd.DataFrame(dct)

n_uniques = (50, 3, 10, 10, 3)
test_case = create_synthetic_df(1000, n_uniques).groupby(['a', 'b', 'c']).agg('sum')

def my_func(grouped_df) -> np.ndarray:
    """Call the solution 'my_func'."""
    ...

# additional test cases, maybe not exhaustive

simple_case = my_func(grouped)  
expected = np.array([
    [[1, 0],
     [1, 0]],
    [[1, 0],
     [np.nan, np.nan]]
])
assert simple_case.shape == (2, 2, 2)
assert np.allclose(simple_case, expected, equal_nan=True)
assert my_func(test_case).shape == (50, 3, 10, 2) 

Answer 1

Score: 1

Fundamentally this is a reindexing operation. There are trickier ways to do this, and I have not profiled this code; this is the "unsurprising" approach.

The second approach uses lower-level Numpy but I don't know which one will be faster. They are tested to be equivalent.
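
To make the core idea concrete before the full code, a minimal sketch on the simple example from the question (assuming the grouped frame defined above; the functions below generalize this to any number of levels):

full = pd.MultiIndex.from_product(
    [grouped.index.unique(level) for level in ['a', 'b']]
)
# The (2, 1) group is absent, so reindexing yields a NaN row for it.
arr = grouped.reindex(full).to_numpy().reshape(2, 2, -1)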

import string
from typing import Sequence

import numpy as np
import pandas as pd

def create_synthetic_df(len_df: int, n_uniques: Sequence[int], seed: int = 2) -> pd.DataFrame:
    rng = np.random.default_rng(seed=seed)
    df = pd.DataFrame(
        data=1 + rng.integers(low=0, high=n_uniques, size=(len_df, len(n_uniques))),
        columns=tuple(string.ascii_lowercase[:len(n_uniques)]),
    )
    return df

def mi_reindex(df: pd.DataFrame, group_cols: list[str]) -> np.ndarray:
    totals: pd.DataFrame = df.groupby(group_cols).sum()
    # Sorted unique values for each group level (groupby already yields them
    # in ascending order, so sorting preserves that order).
    uindex = [
        totals.index.unique(level=level).sort_values()
        for level in group_cols
    ]
    # Cartesian product of all levels; combinations absent from totals
    # become NaN rows after reindexing.
    full_index = pd.MultiIndex.from_product(iterables=uindex)

    aligned = totals.reindex(full_index)
    # One axis per group level, plus a trailing axis for the value columns.
    reshaped = aligned.values.reshape((
        *(u.size for u in uindex),
        totals.columns.size,
    ))
    return reshaped
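
# An added sanity check (not from the original answer): the plain reshape in
# mi_reindex is safe because pd.MultiIndex.from_product enumerates keys in
# C order (last level varying fastest), matching NumPy's row-major reshape.
mi_check = pd.MultiIndex.from_product([[1, 2], [0, 1]])
assert list(mi_check) == [(1, 0), (1, 1), (2, 0), (2, 1)]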

def np_unique(df: pd.DataFrame, group_cols: list[str]) -> np.ndarray:
    totals = df.groupby(group_cols).sum()

    # For each level: (sorted unique values, inverse indices mapping every
    # grouped row to its position among those uniques).
    uniques = [
        np.unique(
            ar=totals.index.get_level_values(col),
            return_inverse=True,
        )
        for col in group_cols
    ]

    # NaN-filled destination: one axis per group level plus the value columns.
    dest = np.full(
        shape=(
            *(u.size for u, idx in uniques),
            totals.columns.size,
        ),
        fill_value=np.nan,
    )

    # Scatter every grouped row into its cell; cells with no group stay NaN.
    idx = tuple(i for u, i in uniques) + (slice(None),)
    dest[idx] = totals
    return dest
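
# An added illustration (not from the original answer) of what return_inverse
# provides: each entry of inv gives the position of the corresponding value
# within the sorted uniques u, i.e. its coordinate along that axis of dest.
_u, _inv = np.unique(np.array([5, 5, 7, 9]), return_inverse=True)
assert list(_u) == [5, 7, 9] and list(_inv) == [0, 0, 1, 2]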

def test() -> None:
    simple_outputs = []
    big_outputs = []

    big_uniques = (50, 3, 10, 10, 3)
    big_input = create_synthetic_df(1000, big_uniques)
    simple_input = pd.DataFrame({
        'a': [1, 1, 2],
        'b': [0, 1, 0],
        'c': [1, 1, 1],
        'd': [0, 0, 0]
    })
    simple_output = np.array([
        [[1, 0],
         [1, 0]],  # group (a=1, b=1): c sums to 1, not 2
        [[1, 0],
         [np.nan, np.nan]]
    ])

    for my_func in (mi_reindex, np_unique):
        actual = my_func(simple_input, ['a', 'b'])
        assert actual.shape == (2, 2, 2)
        assert np.allclose(actual, simple_output, equal_nan=True)
        simple_outputs.append(actual)

        actual = my_func(big_input, ['a', 'b', 'c'])
        assert actual.shape == (50, 3, 10, 2)
        big_outputs.append(actual)

    assert np.allclose(*simple_outputs, equal_nan=True)
    assert np.allclose(*big_outputs, equal_nan=True)

if __name__ == '__main__':
    test()
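
Since speed was the main requirement and the code above is unprofiled, here is a rough benchmark sketch (an addition, not part of the original answer; timings depend on your machine and data shape):

import timeit

big = create_synthetic_df(1_000_000, (50, 3, 10, 10, 3))
for fn in (mi_reindex, np_unique):
    t = timeit.timeit(lambda: fn(big, ['a', 'b', 'c']), number=5)
    print(f'{fn.__name__}: {t / 5:.3f} s per call')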
