英文:
Remove random sample to make group proportions match
问题
我有一个包含区域数据的数据框,我想要使每个国家特定变量的比例相等。这是我的例子。
我有一个表格,其中包含了按性别划分的国家样本的详细信息。我想要删除样本,以使0和1相等。
> table(df$Gender, df$COUNTRY)
1 2 3
0 86 81 282
1 21 7 23
是否有任何包/函数可以删除等于零的值,以保持与等于1的值相匹配?
这将是期望的结果:
> table(df$Gender, df$COUNTRY)
1 2 3
0 21 7 23
1 21 7 23
如果还有其他更基本的方法可以实现这一点,也会很有帮助。例如,删除其中一个国家为1且性别为0的随机样本。然后我可以手动处理每个国家。
有人要求提供一个dput,下面是它:
df <- structure(list(Gender = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0), COUNTRY = c(2, 3, 2, 1, 3, 3, 3, 2, 2, 3, 3,
3, 3, 3, 3, 2, 3, 3, 1, 3, 3, 3, 3, 1, 2, 3, 2, 3, 1, 3, 2, 3,
3, 3, 2, 2, 3, 3, 3, 2, 3, 2, 2, 1, 3, 3, 3, 2, 2, 3, 1, 1, 2,
2, 1, 3, 3, 1, 2, 1, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 2, 1, 3, 2,
3, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 1, 1, 1, 1, 3, 1, 2, 1, 3,
2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 1, 1, 3, 1, 1, 2,
2, 3, 3, 1, 2, 3, 3, 3, 2, 3, 3, 1, 3, 3, 1, 3, 1, 1, 3, 3, 2,
3, 1, 1, 1, 3, 3, 3, 2, 3, 3, 2, 3, 2, 1, 3, 2, 3, 1, 3, 2, 2,
2, 3, 3, 2, 1, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 2, 2,
1, 1, 3, 1, 1, 1, 3, 3, 1, 2, 1, 1, 1, 1, 3, 3, 3, 1, 3, 3, 2,
3, 3, 3, 3, 3, 1, 3, 3, 2, 1, 1, 2, 3, 2, 3, 3, 3, 2, 2, 3, 3,
3, 3, 1, 3, 2, 2, 1, 3, 2, 1, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 3,
3, 1, 3, 2, 1, 1, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 2,
3, 2, 1, 2, 1, 3, 1, 3, 3, 3, 3, 3, 1, 3, 1, 1, 3, 1, 3, 1, 1,
3, 3, 1, 3, 1, 1, 1, 2, 3, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3,
3, 3, 3, 3, 2, 3, 1, 3, 3, 3, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1, 3, 3, 1, 2, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 2, 3,
1, 3, 3, 3, 1, 2, 3, 3, 3, 3, 1, 3, 1, 3, 1, 1, 3, 3, 3, 3, 1,
3, 3, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 3, 3, 1, 3, 3, 2, 3, 3, 1,
3, 3, 3, 2, 3, 1, 3, 3, 1, 3, 2, 1, 2, 3, 3, 3, 3, 3, 3, 2, 2,
2, 3, 3, 2, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 2, 1, 3, 3, 3, 3, 3,
3, 1, 3, 1, 2, 3, 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3,
1, 1, 3, 3, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 3,
3, 3, 2, 3, 2, 3)), row.names = c(NA, 500L), class = "data.frame")
请注意,我只会返回翻译好的部分,不会回答关于翻译的问题。
英文:
I have a dataframe which has regional data, I'd like to be able to make the proportions within each country equal for a specific variable. Here is my example.
I have a table has the details of the sample, for Country by Gender. I'd like to be able to remove sample, with aim that 0 and 1 are equivalent.
> table(df$Gender , df$COUNTRY)
1 2 3
0 86 81 282
1 21 7 23
Is there any package/function that will remove the values equal to zero to keep enough to match those equal 1?
This would be the desired result
> table(df$Gender , df$COUNTRY)
1 2 3
0 21 7 23
1 21 7 23
If there is also a more basic way to do this, that will help too. For example, remove a random sample of 65 where df$Country=1 & df$Gender=0. I can then do each of country manually.
As someone requested a dput, here we go. The tables above have changed accordingly
df <- structure(list(Gender = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0), COUNTRY = c(2, 3, 2, 1, 3, 3, 3, 2, 2, 3, 3,
3, 3, 3, 3, 2, 3, 3, 1, 3, 3, 3, 3, 1, 2, 3, 2, 3, 1, 3, 2, 3,
3, 3, 2, 2, 3, 3, 3, 2, 3, 2, 2, 1, 3, 3, 3, 2, 2, 3, 1, 1, 2,
2, 1, 3, 3, 1, 2, 1, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 2, 1, 3, 2,
3, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 1, 1, 1, 1, 3, 1, 2, 1, 3,
2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 1, 1, 3, 1, 1, 2,
2, 3, 3, 1, 2, 3, 3, 3, 2, 3, 3, 1, 3, 3, 1, 3, 1, 1, 3, 3, 2,
3, 1, 1, 1, 3, 3, 3, 2, 3, 3, 2, 3, 2, 1, 3, 2, 3, 1, 3, 2, 2,
2, 3, 3, 2, 1, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 2, 2,
1, 1, 3, 1, 1, 1, 3, 3, 1, 2, 1, 1, 1, 1, 3, 3, 3, 1, 3, 3, 2,
3, 3, 3, 3, 3, 1, 3, 3, 2, 1, 1, 2, 3, 2, 3, 3, 3, 2, 2, 3, 3,
3, 3, 1, 3, 2, 2, 1, 3, 2, 1, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 3,
3, 1, 3, 2, 1, 1, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 2,
3, 2, 1, 2, 1, 3, 1, 3, 3, 3, 3, 3, 1, 3, 1, 1, 3, 1, 3, 1, 1,
3, 3, 1, 3, 1, 1, 1, 2, 3, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3,
3, 3, 3, 3, 2, 3, 1, 3, 3, 3, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1, 3, 3, 1, 2, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 2, 3,
1, 3, 3, 3, 1, 2, 3, 3, 3, 3, 1, 3, 1, 3, 1, 1, 3, 3, 3, 3, 1,
3, 3, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 3, 3, 1, 3, 3, 2, 3, 3, 1,
3, 3, 3, 2, 3, 1, 3, 3, 1, 3, 2, 1, 2, 3, 3, 3, 3, 3, 3, 2, 2,
2, 3, 3, 2, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 2, 1, 3, 3, 3, 3, 3,
3, 1, 3, 1, 2, 3, 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3,
1, 1, 3, 3, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 3,
3, 3, 2, 3, 2, 3)), row.names = c(NA, 500L), class = "data.frame")
答案1
得分: 4
这是一个基于R语言的方法。
首先,对数据进行分组并计算行差异。然后,从第一列为0且与每个表列名相等的索引中随机抽取与这些差异相同数量的行。这些行是要删除的行。
n <- 100L
set.seed(2023)
df1 <- data.frame(
a = sample(0:1, n, TRUE, prob = c(3, 1)/4),
b = sample(3, n, TRUE)
)
i <- df1$a == 0
table(df1) |
apply(2, diff) -> tmp
lapply(names(tmp), \(nm) {
j <- which(i & df1$b == nm)
sample(j, abs(tmp[nm]))
}) | unlist() -> tmp
df1[-tmp, ] | table()
#> b
#> a 1 2 3
#> 0 8 10 8
#> 1 8 10 8
rm(tmp) # tidy up
创建于2023-08-08,使用reprex v2.0.2
英文:
Here is a base R way.
First, table the data and compute the rows differences. Then, sample from the indices that have 1st column 0 and are equal to each table column name as many as those differences. These are the rows to remove.
n <- 100L
set.seed(2023)
df1 <- data.frame(
a = sample(0:1, n, TRUE, prob = c(3, 1)/4),
b = sample(3, n, TRUE)
)
i <- df1$a == 0
table(df1) |>
apply(2, diff) -> tmp
lapply(names(tmp), \(nm) {
j <- which(i & df1$b == nm)
sample(j, abs(tmp[nm]))
}) |> unlist() -> tmp
df1[-tmp, ] |> table()
#> b
#> a 1 2 3
#> 0 8 10 8
#> 1 8 10 8
rm(tmp) # tidy up
<sup>Created on 2023-08-08 with reprex v2.0.2</sup>
答案2
得分: 3
caret::downSample
函数非常适合这种情况:
首先是一些数据:
z1 <- data.frame(gender = c(rep(0, 9155), rep(1, 2628)),
country = 1)
z2 <- data.frame(gender = c(rep(0, 9335), rep(1, 1242)),
country = 2)
z3 <- data.frame(gender = c(rep(0, 31964), rep(1, 2720)),
country = 3)
z <- rbind(z1, z2, z3)
table(z)
#输出
country
gender 1 2 3
0 9155 9335 31964
1 2628 1242 2720
应用 down sampling,其中 y
参数是你想要使用的分类变量的交互作用:
set.seed(123) #使结果可重现
z_down <- caret::downSample(z, as.factor(interaction(z$gender, z$country)))
table(z_down[names(z_down) != "Class"]) #移除添加的列
#输出
country
gender 1 2 3
0 1242 1242 1242
1 1242 1242 1242
这样可以使每个分类变量组合的行数与样本最少的组合(1242)相等。
如果不需要这样,可以按国家进行操作:
lapply(split(z, z$country), \(x) caret::downSample(x, as.factor(x$gender))) -> z_2
然后将数据框的列表合并为一个:
z_2 <- do.call(rbind, z_2)
并移除 caret 添加的列:
z_2 <- z_2[, names(z_2) != "Class"]
table( z_2)
#输出
country
gender 1 2 3
0 2628 1242 2720
1 2628 1242 2720
编辑:我有一些额外的时间,所以这里有一个根据你在评论中提到的比例混合的函数(因为 caret::downSample
不支持):
downSample_custm <- function(df,
factor,
by = rep(1, nrow(df)),
proportion = 1,
output = c("df", "index")){
output <- match.arg(output)
# 使用 by 参数将数据框拆分为列表,如果不使用 by 参数,它将使用整个数据框
splited <- split(data.frame(df,
.factor = as.factor(factor),
row = 1:nrow(df)), by)
lapply(splited, function(x){
# 获取少数类别的实例数
minClass <- min(table(x$.factor))
# 按类别拆分输入数据框
spl <- split(x, x$.factor)
# 遍历类别
lapply(spl, function(i){
# 类别为 i 的数据框
y <- nrow(i) #行数
if(proportion >= 1 && y <= ceiling(minClass * proportion)){
# 如果类别的实例数小于等于 minClass * proportion,则返回全部实例
return(i)
} else {
# 从类别中随机抽样 ceiling(minClass * proportion) 次
return(i[sample(seq_along(i[,1]), ceiling(minClass * proportion)),])
}
}) -> out1
out1 <- do.call(rbind, out1)
return(out1)
}) -> out2
out2 <- do.call(rbind, out2)
rownames(out2) <- NULL
if(output == "df"){
return(out2[, !colnames(out2) %in% c(".factor", "row")])
} else if (output == "index") {
return(sort(out2$row))
}
}
用法:
对所有类别进行 down sample:
z2 <- downSample_custm(z, factor = z$gender, by = z$country, proportion = 0.3)
table(z2)
#输出
country
gender 1 2 3
0 789 373 816
1 789 373 816
对过度表示的类别进行 down sample:
z3 <- downSample_custm(z, factor = z$gender, by = z$country, proportion = 1.5)
table(z3)
#输出
country
gender 1 2 3
0 3942 1863 4080
1 2628 1242 2720
返回行索引而不是 down sample 后的数据框:
z4 <- downSample_custm(z, factor = z$gender, by = z$country, proportion = 1, output = "index")
table(z[z4,])
#输出
country
gender 1 2 3
0 2628 1242 2720
1 2628 1242 2720
简单的 down sample(没有分组):
z5 <- downSample_custm(z, factor = z$gender, proportion = 1)
table(z5$gender)
#输出
0 1
6590 6590
英文:
The caret::downSample
is practically tailored for this:
Some data:
z1 <- data.frame(gender = c(rep(0, 9155), rep(1, 2628)),
country = 1)
z2 <- data.frame(gender = c(rep(0, 9335), rep(1, 1242)),
country = 2)
z3 <- data.frame(gender = c(rep(0, 31964), rep(1, 2720)),
country = 3)
z <- rbind(z1, z2, z3)
table(z)
#output
country
gender 1 2 3
0 9155 9335 31964
1 2628 1242 2720
Apply down sampling where the y
argument would be the interaction of the categorical variables you wish to use:
set.seed(123) #make it reproducible
z_down <- caret::downSample(z, as.factor(interaction(z$gender, z$country)))
table(z_down[names(z_down) != "Class"]) #remove added column
#output
country
gender 1 2 3
0 1242 1242 1242
1 1242 1242 1242
This gives equal number of rows per categorical variable combo as in the combination with the lowest number of samples (1242).
If this is not desired, one can do by country:
lapply(split(z, z$country), \(x) caret::downSample(x, as.factor(x$gender))) -> z_2
then just combine the list of data frames into one:
z_2 <- do.call(rbind, z_2)
and remove the column caret added:
z_2 <- z_2[, names(z_2) != "Class"]
table( z_2)
#output
country
gender 1 2 3
0 2628 1242 2720
1 2628 1242 2720
EDIT: I had some extra time so here is a function that does what you mention in the comment (proportion mix) since caret::downSample
does not:
downSample_custm <- function(df,
factor,
by = rep(1, nrow(df)),
proportion = 1,
output = c("df", "index")){
output <- match.arg(output)
# split data.frame into a list using the by argument,
# if by is not used it will just take the whole data.frame
splited <- split(data.frame(df,
.factor = as.factor(factor),
row = 1:nrow(df)), by)
lapply(splited, function(x){
#get the number of instances of the minority class
minClass <- min(table(x$.factor))
#split input data.frame by class
spl <- split(x, x$.factor)
#loop over classes
lapply(spl, function(i){
#data.frame with class i
y <- nrow(i) #number of rows
if(proportion >= 1 && y <= ceiling(minClass * proportion)){
#if class has less instances then minClass * proportion return it all
return(i)
} else {
#sample from class ceiling(minClass * proportion) times
return(i[sample(seq_along(i[,1]), ceiling(minClass * proportion)),])
}
}) -> out1
out1 <- do.call(rbind, out1)
return(out1)
}) -> out2
out2 <- do.call(rbind, out2)
rownames(out2) <- NULL
if(output == "df"){
return(out2[, !colnames(out2) %in% c(".factor", "row")])
} else if (output == "index") {
return(sort(out2$row))
}
}
Usage:
down sample all classes:
z2 <- downSample_custm(z, factor = z$gender, by = z$country, proportion = 0.3)
table(z2)
#output
country
gender 1 2 3
0 789 373 816
1 789 373 816
down sample classes which are over represented:
z3 <- downSample_custm(z, factor = z$gender, by = z$country, proportion = 1.5)
table(z3)
#output
country
gender 1 2 3
0 3942 1863 4080
1 2628 1242 2720
return row index instead of down sampled data.frame:
z4 <- downSample_custm(z, factor = z$gender, by = z$country, proportion = 1, output = "index")
table(z[z4,])
#output
country
gender 1 2 3
0 2628 1242 2720
1 2628 1242 2720
simple down sample (no by group):
z5 <- downSample_custm(z, factor = z$gender, proportion = 1)
table(z5$gender)
#output
0 1
6590 6590
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论