英文:
Polars Convert Back From Dummies
问题
# 在pandas中,我可以使用[`from_dummies`](https://pandas.pydata.org/docs/reference/api/pandas.from_dummies.html)方法来反转独热编码。在polars中似乎没有内置的方法来做到这一点。这里是一个基本的例子:
pl.DataFrame({
"col1_hi": [0,0,0,1,1],
"col1_med": [0,0,1,0,0],
"col1_lo": [1,1,0,0,0],
"col2_yes": [1,1,0,1,0],
"col2_no": [0,0,1,0,1],
})
反转 `to_dummies` 操作应该得到类似这样的结果:
pl.DataFrame({
"col1": ["lo", "lo", "med", "hi", "hi"],
"col2": ["yes", "yes", "no", "yes", "no"],
})
我的第一反应是使用pivot。我该如何实现这个功能?
英文:
In pandas I can use the `from_dummies` method to reverse one-hot encoding. There doesn't seem to be a built-in method for this in polars. Here is a basic example:
pl.DataFrame({
"col1_hi": [0,0,0,1,1],
"col1_med": [0,0,1,0,0],
"col1_lo": [1,1,0,0,0],
"col2_yes": [1,1,0,1,0],
"col2_no": [0,0,1,0,1],
})
┌─────────┬──────────┬─────────┬──────────┬─────────┐
│ col1_hi ┆ col1_med ┆ col1_lo ┆ col2_yes ┆ col2_no │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════════╪══════════╪═════════╪══════════╪═════════╡
│ 0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │
│ 0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │
│ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │
│ 1 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │
│ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │
└─────────┴──────────┴─────────┴──────────┴─────────┘
Reversing the `to_dummies` operation should result in something like this:
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ str │
╞══════╪══════╡
│ lo ┆ yes │
│ lo ┆ yes │
│ med ┆ no │
│ hi ┆ yes │
│ hi ┆ no │
└──────┴──────┘
My first thought was to use a pivot. How could I go about implementing this functionality?
答案1
得分: 4
你可以利用 pl.coalesce
# Step 1: replace every 1 in a dummy column with that column's label (the
# text after the last underscore); cells that are not 1 become null because
# the when/then has no otherwise branch.
labelled = df.with_columns(
    pl.when(pl.col(name) == 1)
    .then(pl.lit(name).str.extract(r"([^_]+$)"))
    .alias(name)
    for name in df.columns
)

# Step 2: collect the group prefixes (text before the last underscore).
# dict.fromkeys de-duplicates while keeping first-seen order.
prefixes = dict.fromkeys(
    name.rsplit("_", maxsplit=1)[0]
    for name in df.columns
)

# Step 3: for each prefix, select its columns by regex and keep the first
# non-null label in every row.
labelled.select(
    pl.coalesce(pl.col(f"^{prefix}_.+$")).alias(prefix)
    for prefix in prefixes
)
形状:(5,2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ str │
╞══════╪══════╡
│ lo ┆ yes │
│ lo ┆ yes │
│ med ┆ no │
│ hi ┆ yes │
│ hi ┆ no │
└──────┴──────┘
更新: @Rodalm's方法 更简洁:
def from_dummies(df, separator="_"):
    """Reverse a one-hot (dummy) encoding into one string column per group.

    Each column name is split at the LAST occurrence of ``separator`` into a
    group name and a category label, e.g. ``"col1_hi"`` -> ``("col1", "hi")``.
    For every group, a row receives the label of the first dummy column (in
    ``df.columns`` order) whose value equals 1, or null when none does.

    Args:
        df: polars DataFrame whose columns are all dummy columns named
            ``<group><separator><label>``.
        separator: Text between the group name and the label.

    Returns:
        A polars DataFrame with one column per group, in first-seen order.

    Raises:
        ValueError: If a column name does not contain ``separator``.
    """
    col_exprs = {}
    for col in df.columns:
        name, found, value = col.rpartition(separator)
        if not found:
            raise ValueError(
                f"column {col!r} does not contain the separator {separator!r}"
            )
        # pl.lit() is required: current polars parses a bare string passed
        # to then() as a column name, not as a literal label.
        expr = pl.when(pl.col(col) == 1).then(pl.lit(value))
        col_exprs.setdefault(name, []).append(expr)

    return df.select(
        pl.coalesce(exprs).alias(name)  # keep the first non-null label by row
        for name, exprs in col_exprs.items()
    )
英文:
You could utilize pl.coalesce
# Step 1: replace every 1 in a dummy column with that column's label (the
# text after the last underscore); cells that are not 1 become null because
# the when/then has no otherwise branch.
labelled = df.with_columns(
    pl.when(pl.col(name) == 1)
    .then(pl.lit(name).str.extract(r"([^_]+$)"))
    .alias(name)
    for name in df.columns
)

# Step 2: collect the group prefixes (text before the last underscore).
# dict.fromkeys de-duplicates while keeping first-seen order.
prefixes = dict.fromkeys(
    name.rsplit("_", maxsplit=1)[0]
    for name in df.columns
)

# Step 3: for each prefix, select its columns by regex and keep the first
# non-null label in every row.
labelled.select(
    pl.coalesce(pl.col(f"^{prefix}_.+$")).alias(prefix)
    for prefix in prefixes
)
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ str │
╞══════╪══════╡
│ lo ┆ yes │
│ lo ┆ yes │
│ med ┆ no │
│ hi ┆ yes │
│ hi ┆ no │
└──────┴──────┘
Update: @Rodalm's approach is much neater:
def from_dummies(df, separator="_"):
    """Reverse a one-hot (dummy) encoding into one string column per group.

    Each column name is split at the LAST occurrence of ``separator`` into a
    group name and a category label, e.g. ``"col1_hi"`` -> ``("col1", "hi")``.
    For every group, a row receives the label of the first dummy column (in
    ``df.columns`` order) whose value equals 1, or null when none does.

    Args:
        df: polars DataFrame whose columns are all dummy columns named
            ``<group><separator><label>``.
        separator: Text between the group name and the label.

    Returns:
        A polars DataFrame with one column per group, in first-seen order.

    Raises:
        ValueError: If a column name does not contain ``separator``.
    """
    col_exprs = {}
    for col in df.columns:
        name, found, value = col.rpartition(separator)
        if not found:
            raise ValueError(
                f"column {col!r} does not contain the separator {separator!r}"
            )
        # pl.lit() is required: current polars parses a bare string passed
        # to then() as a column name, not as a literal label.
        expr = pl.when(pl.col(col) == 1).then(pl.lit(value))
        col_exprs.setdefault(name, []).append(expr)

    return df.select(
        pl.coalesce(exprs).alias(name)  # keep the first non-null label by row
        for name, exprs in col_exprs.items()
    )
答案2
得分: 2
与 @jqurious's answer 类似、使用 `pl.coalesce` 的方法:
from collections import defaultdict
import polars as pl
df = pl.DataFrame({
"col1_hi": [0,0,0,1,1],
"col1_med": [0,0,1,0,0],
"col1_lo": [1,1,0,0,0],
"col2_yes": [1,1,0,1,0],
"col2_no": [0,0,1,0,1],
})
def from_dummies(df, sep="_"):
    """Reverse a one-hot (dummy) encoding using ``pl.coalesce``.

    Column names are split at the LAST occurrence of ``sep`` so that group
    names which themselves contain the separator (``"my_col_hi"``) still
    unpack into exactly ``(group, label)``. For each group, a row gets the
    label of the first dummy column equal to 1, or null when none is.

    Args:
        df: polars DataFrame whose columns are all dummy columns named
            ``<group><sep><label>``.
        sep: Text between the group name and the label.

    Returns:
        A polars DataFrame with one column per group, in first-seen order.

    Raises:
        ValueError: If a column name does not contain ``sep``.
    """
    col_exprs = defaultdict(list)
    for col in df.columns:
        name, found, value = col.rpartition(sep)
        if not found:
            raise ValueError(
                f"column {col!r} does not contain the separator {sep!r}"
            )
        # Null where the dummy is not 1. pl.lit() keeps the label a literal;
        # current polars would resolve a bare string as a column name.
        col_exprs[name].append(pl.when(pl.col(col) == 1).then(pl.lit(value)))

    return df.select(**{
        name: pl.coalesce(exprs)  # keep the first non-null label by row
        for name, exprs in col_exprs.items()
    })
或者是泛化 @warwick12's approach,使用多个 when 和 then 连接的方法:
def from_dummies(df, sep="_"):
    """Reverse a one-hot (dummy) encoding with one chained when/then per group.

    Column names are split at the LAST occurrence of ``sep`` into a group
    name and a label. Each group's columns are folded into a single
    ``when(...).then(...).when(...).then(...)`` chain, so a row gets the label
    of the first dummy column equal to 1, or null when none is (the chain has
    no ``otherwise`` branch).

    Args:
        df: polars DataFrame whose columns are all dummy columns named
            ``<group><sep><label>``.
        sep: Text between the group name and the label.

    Returns:
        A polars DataFrame with one column per group, in first-seen order.

    Raises:
        ValueError: If a column name does not contain ``sep``.
    """
    col_exprs = {}
    for col in df.columns:
        name, found, value = col.rpartition(sep)
        if not found:
            raise ValueError(
                f"column {col!r} does not contain the separator {sep!r}"
            )
        is_set = pl.col(col) == 1
        # pl.lit() keeps the label a literal; current polars would resolve a
        # bare string passed to then() as a column name.
        label = pl.lit(value)
        if name not in col_exprs:
            col_exprs[name] = pl.when(is_set).then(label)
        else:
            # Extend the existing chain for this group with another branch.
            col_exprs[name] = col_exprs[name].when(is_set).then(label)
    return df.select(**col_exprs)
输出:
>>> from_dummies(df)
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ str │
╞══════╪══════╡
│ lo ┆ yes │
│ lo ┆ yes │
│ med ┆ no │
│ hi ┆ yes │
│ hi ┆ no │
└──────┴──────┘
英文:
A similar approach to @jqurious's answer, using `pl.coalesce`:
from collections import defaultdict
import polars as pl
df = pl.DataFrame({
"col1_hi": [0,0,0,1,1],
"col1_med": [0,0,1,0,0],
"col1_lo": [1,1,0,0,0],
"col2_yes": [1,1,0,1,0],
"col2_no": [0,0,1,0,1],
})
def from_dummies(df, sep="_"):
    """Reverse a one-hot (dummy) encoding using ``pl.coalesce``.

    Column names are split at the LAST occurrence of ``sep`` so that group
    names which themselves contain the separator (``"my_col_hi"``) still
    unpack into exactly ``(group, label)``. For each group, a row gets the
    label of the first dummy column equal to 1, or null when none is.

    Args:
        df: polars DataFrame whose columns are all dummy columns named
            ``<group><sep><label>``.
        sep: Text between the group name and the label.

    Returns:
        A polars DataFrame with one column per group, in first-seen order.

    Raises:
        ValueError: If a column name does not contain ``sep``.
    """
    col_exprs = defaultdict(list)
    for col in df.columns:
        name, found, value = col.rpartition(sep)
        if not found:
            raise ValueError(
                f"column {col!r} does not contain the separator {sep!r}"
            )
        # Null where the dummy is not 1. pl.lit() keeps the label a literal;
        # current polars would resolve a bare string as a column name.
        col_exprs[name].append(pl.when(pl.col(col) == 1).then(pl.lit(value)))

    return df.select(**{
        name: pl.coalesce(exprs)  # keep the first non-null label by row
        for name, exprs in col_exprs.items()
    })
Or, generalizing @warwick12's approach by chaining multiple `when`/`then` calls:
def from_dummies(df, sep="_"):
    """Reverse a one-hot (dummy) encoding with one chained when/then per group.

    Column names are split at the LAST occurrence of ``sep`` into a group
    name and a label. Each group's columns are folded into a single
    ``when(...).then(...).when(...).then(...)`` chain, so a row gets the label
    of the first dummy column equal to 1, or null when none is (the chain has
    no ``otherwise`` branch).

    Args:
        df: polars DataFrame whose columns are all dummy columns named
            ``<group><sep><label>``.
        sep: Text between the group name and the label.

    Returns:
        A polars DataFrame with one column per group, in first-seen order.

    Raises:
        ValueError: If a column name does not contain ``sep``.
    """
    col_exprs = {}
    for col in df.columns:
        name, found, value = col.rpartition(sep)
        if not found:
            raise ValueError(
                f"column {col!r} does not contain the separator {sep!r}"
            )
        is_set = pl.col(col) == 1
        # pl.lit() keeps the label a literal; current polars would resolve a
        # bare string passed to then() as a column name.
        label = pl.lit(value)
        if name not in col_exprs:
            col_exprs[name] = pl.when(is_set).then(label)
        else:
            # Extend the existing chain for this group with another branch.
            col_exprs[name] = col_exprs[name].when(is_set).then(label)
    return df.select(**col_exprs)
Output:
>>> from_dummies(df)
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ str │
╞══════╪══════╡
│ lo ┆ yes │
│ lo ┆ yes │
│ med ┆ no │
│ hi ┆ yes │
│ hi ┆ no │
└──────┴──────┘
答案3
得分: 1
你可以使用 pl.when()、pl.col() 和 pl.lit() 方法将包含虚拟变量的 Polars DataFrame 转换回原始格式。这将每列的虚拟变量映射回其原始值。
import polars as pl

# Create the dummy-variable (one-hot encoded) DataFrame.
df = pl.DataFrame({
    "col1_hi": [0, 0, 0, 1, 1],
    "col1_med": [0, 0, 1, 0, 0],
    "col1_lo": [1, 1, 0, 0, 0],
    "col2_yes": [1, 1, 0, 1, 0],
    "col2_no": [0, 0, 1, 0, 1],
})

# Map the dummies back to their original values. Every label is wrapped in
# pl.lit(): current polars parses a bare string passed to then()/otherwise()
# as a column name, so otherwise("lo") would look up a column called "lo"
# instead of producing the text "lo".
# NOTE: the otherwise() fallbacks label a row "lo" / "no" whenever the other
# dummies of the group are not 1, including a row where every dummy is 0.
df = df.select([
    pl.when(pl.col("col1_hi") == 1).then(pl.lit("hi"))
    .when(pl.col("col1_med") == 1).then(pl.lit("med"))
    .otherwise(pl.lit("lo"))
    .alias("col1"),
    pl.when(pl.col("col2_yes") == 1).then(pl.lit("yes"))
    .otherwise(pl.lit("no"))
    .alias("col2"),
])

# Display the reconstructed DataFrame.
print(df)
英文:
You can use the `pl.when()`, `pl.col()` and `pl.lit()` methods to convert a polars DataFrame with dummy variables back to the original format. This maps each column's dummies back to their original values.
import polars as pl
# Create the dummy-variable (one-hot encoded) DataFrame.
df = pl.DataFrame({
    "col1_hi": [0, 0, 0, 1, 1],
    "col1_med": [0, 0, 1, 0, 0],
    "col1_lo": [1, 1, 0, 0, 0],
    "col2_yes": [1, 1, 0, 1, 0],
    "col2_no": [0, 0, 1, 0, 1],
})

# Map the dummies back to their original values. Every label is wrapped in
# pl.lit(): current polars parses a bare string passed to then()/otherwise()
# as a column name, so otherwise("lo") would look up a column called "lo"
# instead of producing the text "lo".
# NOTE: the otherwise() fallbacks label a row "lo" / "no" whenever the other
# dummies of the group are not 1, including a row where every dummy is 0.
df = df.select([
    pl.when(pl.col("col1_hi") == 1).then(pl.lit("hi"))
    .when(pl.col("col1_med") == 1).then(pl.lit("med"))
    .otherwise(pl.lit("lo"))
    .alias("col1"),
    pl.when(pl.col("col2_yes") == 1).then(pl.lit("yes"))
    .otherwise(pl.lit("no"))
    .alias("col2"),
])

# Display the reconstructed DataFrame.
print(df)
Output:
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ str │
╞══════╪══════╡
│ lo ┆ yes │
│ lo ┆ yes │
│ med ┆ no │
│ hi ┆ yes │
│ hi ┆ no │
└──────┴──────┘
答案4
得分: 1
你可以这样进行融合/拆分/过滤/旋转:
# Reverse the one-hot encoding with a melt -> split -> filter -> pivot pipeline.

# Tag each row with an index column "i" so the original order can be restored.
indexed = df.with_row_count("i")

# Long format keyed by "i"; the following steps read the resulting
# 'variable' (dummy column name) and 'value' (0/1) columns.
long_form = indexed.melt('i')

# Split each dummy column name on the underscore, then keep the first piece
# as the group name ("col") and the last piece as the label ("val").
tokens = long_form.with_columns(pl.col('variable').str.split('_'))
labelled = tokens.with_columns(
    col=pl.col('variable').arr.first(),
    val=pl.col('variable').arr.last(),
)

# Keep only the dummies that are switched on.
hits = labelled.filter(pl.col('value')==1)

# Back to wide format: positional pivot arguments as in the original snippet
# (presumably values, index, columns for the polars version it targets),
# then restore row order and drop the helper index.
hits.pivot('val','i','col').sort('i').drop('i')
英文:
You can do a melt/split/filter/pivot like this:
# Reverse the one-hot encoding with a melt -> split -> filter -> pivot pipeline.

# Tag each row with an index column "i" so the original order can be restored.
indexed = df.with_row_count("i")

# Long format keyed by "i"; the following steps read the resulting
# 'variable' (dummy column name) and 'value' (0/1) columns.
long_form = indexed.melt('i')

# Split each dummy column name on the underscore, then keep the first piece
# as the group name ("col") and the last piece as the label ("val").
tokens = long_form.with_columns(pl.col('variable').str.split('_'))
labelled = tokens.with_columns(
    col=pl.col('variable').arr.first(),
    val=pl.col('variable').arr.last(),
)

# Keep only the dummies that are switched on.
hits = labelled.filter(pl.col('value')==1)

# Back to wide format: positional pivot arguments as in the original snippet
# (presumably values, index, columns for the polars version it targets),
# then restore row order and drop the helper index.
hits.pivot('val','i','col').sort('i').drop('i')
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论