Create stacked numpy array from grouped dataframe
Question
I need a fast function that creates a single stacked numpy array from a Pandas dataframe after grouping it, with missing rows filled in. For group-by columns 1, 2, ..., the output array should have shape (n_unique_values_1, n_unique_values_2, ..., n_ungrouped_columns). Missing values should be filled with NaN; you may assume that all values can be safely handled as numeric.
Example:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [1, 1, 2],
    'b': [0, 1, 0],
    'c': [1, 1, 1],
    'd': [0, 0, 0]
})
grouped = df.groupby(['a', 'b']).agg('sum')
I need a function on grouped that returns a numpy array of shape (df.a.nunique(), df.b.nunique(), n_ungrouped_cols), in this case (2, 2, 2). The function should work with an arbitrary number of group-by columns, and the returned array should have its axes in the same order as the groupby. I need to run this on many millions of values in a pipeline that already has a lot to do, so fast as hell would be very much appreciated. Note that Pandas groupby preserves the ascending order of the unique values, and that ordering must not be lost (a quick check of this is shown below). But if you can write this without using a grouped dataframe, go for it. Any imports (numba, etc.) that can make this quick are acceptable, as long as they come from well-maintained code bases.
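To be explicit about the ordering claim, here is a quick sanity check on the example above (with the default sort=True, groupby sorts the group keys ascending):

print(df.groupby(['a', 'b']).agg('sum').index.tolist())
# [(1, 0), (1, 1), (2, 0)] -- ascending on 'a', then 'b'; the (2, 1) combination is absent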
The following can be used to create test cases; test_case in this example has missing rows:
def create_synthetic_df(len_df, n_uniques: list[int]):
    rng = np.random.default_rng(seed=2)
    if len(n_uniques) > 10:
        n_uniques = n_uniques[:10]
    dct = {}
    for col, n_unique in zip('abcdefghij', n_uniques):
        dct[col] = rng.integers(0, n_unique, size=len_df)
    return pd.DataFrame(dct)

n_uniques = (50, 3, 10, 10, 3)
test_case = create_synthetic_df(1000, n_uniques).groupby(['a', 'b', 'c']).agg('sum')

def my_func(grouped_df) -> np.ndarray:
    """Call the solution 'my_func'."""
    ...
# additional test cases, maybe not exhaustive
simple_case = my_func(grouped)
expected = np.array([
    [[1, 0],
     [1, 0]],
    [[1, 0],
     [np.nan, np.nan]]
])
assert simple_case.shape == (2, 2, 2)
assert np.allclose(simple_case, expected, equal_nan=True)
assert my_func(test_case).shape == (50, 3, 10, 2)
Answer 1
Score: 1
Fundamentally this is a reindexing operation. There are trickier ways to do this, and I have not profiled this code; this is the "unsurprising" approach.
The second approach uses lower-level NumPy, but I don't know which one will be faster. They are tested to be equivalent.
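As a minimal sketch of the reindexing idea, using grouped from the question (just the two-level case; the full code below generalizes it):

full = pd.MultiIndex.from_product([
    grouped.index.unique(level='a'),   # [1, 2]
    grouped.index.unique(level='b'),   # [0, 1]
])
aligned = grouped.reindex(full)                  # the missing (2, 1) group becomes a NaN row
stacked = aligned.to_numpy().reshape(2, 2, -1)   # axes: a, b, ungrouped columns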
import string
from typing import Sequence

import numpy as np
import pandas as pd


def create_synthetic_df(len_df: int, n_uniques: Sequence[int], seed: int = 2) -> pd.DataFrame:
    rng = np.random.default_rng(seed=seed)
    df = pd.DataFrame(
        data=1 + rng.integers(low=0, high=n_uniques, size=(len_df, len(n_uniques))),
        columns=tuple(string.ascii_lowercase[:len(n_uniques)]),
    )
    return df


def mi_reindex(df: pd.DataFrame, group_cols: list[str]) -> np.ndarray:
    totals: pd.DataFrame = df.groupby(group_cols).sum()
    # Sorted unique values per grouping level.
    uindex = [
        totals.index.unique(level=level).sort_values()
        for level in group_cols
    ]
    # Reindex against the full Cartesian product; missing groups become NaN rows.
    full_index = pd.MultiIndex.from_product(iterables=uindex)
    aligned = totals.reindex(full_index)
    reshaped = aligned.values.reshape((
        *(
            u.size for u in uindex
        ),
        totals.columns.size,
    ))
    return reshaped


def np_unique(df: pd.DataFrame, group_cols: list[str]) -> np.ndarray:
    totals = df.groupby(group_cols).sum()
    # For each level: (sorted unique values, inverse indices into them).
    uniques = [
        np.unique(
            ar=totals.index.get_level_values(col),
            return_inverse=True,
        )
        for col in group_cols
    ]
    dest = np.full(
        shape=(
            *(u.size for u, i in uniques),
            totals.columns.size,
        ),
        fill_value=np.nan,
    )
    # Scatter the aggregated rows into place via the inverse indices.
    idx = tuple(i for u, i in uniques) + (slice(None),)
    dest[idx] = totals
    return dest


def test() -> None:
    simple_outputs = []
    big_outputs = []
    big_uniques = (50, 3, 10, 10, 3)
    big_input = create_synthetic_df(1000, big_uniques)
    simple_input = pd.DataFrame({
        'a': [1, 1, 2],
        'b': [0, 1, 0],
        'c': [1, 1, 1],
        'd': [0, 0, 0]
    })
    simple_output = np.array([
        [[1, 0],
         [1, 0]],  # this is not 2
        [[1, 0],
         [np.nan, np.nan]]
    ])
    for my_func in (mi_reindex, np_unique):
        actual = my_func(simple_input, ['a', 'b'])
        assert actual.shape == (2, 2, 2)
        assert np.allclose(actual, simple_output, equal_nan=True)
        simple_outputs.append(actual)
        actual = my_func(big_input, ['a', 'b', 'c'])
        assert actual.shape == (50, 3, 10, 2)
        big_outputs.append(actual)
    assert np.allclose(*simple_outputs, equal_nan=True)
    assert np.allclose(*big_outputs, equal_nan=True)


if __name__ == '__main__':
    test()
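Since I have not profiled these, here is a minimal timing sketch using the synthetic generator above (the input size and repeat count are arbitrary choices, not tuned; absolute numbers will vary by machine and pandas version):

import timeit

bench_input = create_synthetic_df(1_000_000, (50, 3, 10, 10, 3))
for func in (mi_reindex, np_unique):
    seconds = timeit.timeit(lambda: func(bench_input, ['a', 'b', 'c']), number=10)
    print(f'{func.__name__}: {seconds / 10:.4f} s per call')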
Comments