英文:
Take random sample of rows from dataframe with grouping variables
问题
以下是代码部分的翻译:
我有一个数据框,结构如下:
# Toy data: 36 item types, each crossed with the 3 levels of condition1 and
# the 2 levels of condition2 (6 rows per item type, 216 rows in total).
dat <- tibble(
  item_type = rep(1:36, each = 6),
  condition1 = rep(c("a", "b", "c"), times = 72),
  condition2 = rep(c("y", "z"), each = 3, times = 36)
) %>%
  # remove = FALSE keeps item_type, condition1 and condition2 alongside the
  # combined `unique` key, which is what the 4-column printout requires.
  unite(unique, item_type, condition1, condition2, sep = "-", remove = FALSE)
看起来像这样:
# A tibble: 216 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 1-a-y 1 a y
2 1-b-y 1 b y
3 1-c-y 1 c y
4 1-a-z 1 a z
5 1-b-z 1 b z
6 1-c-z 1 c z
7 2-a-y 2 a y
8 2-b-y 2 b y
9 2-c-y 2 c y
10 2-a-z 2 a z
我想随机抽取 36 行数据。样本应包含每个 condition1 × condition2 组合各 6 次重复,且 item_type 不得重复。
使用 slice_sample()
似乎可以得到我想要的子集:
# Reproducible draw of 6 rows per condition1 x condition2 cell, followed by a
# tally of how many rows each cell received.
set.seed(1)
dat %>%
  slice_sample(n = 6, by = c(condition1, condition2)) %>%
  count(condition1, condition2)
condition1 condition2 n
1 a y 6
2 a z 6
3 b y 6
4 b z 6
5 c y 6
6 c z 6
但仔细检查后,我发现 item_type
被重复了。
# Same seeded draw as before, but tallied per item_type and sorted so the
# most frequently drawn item types come first.
set.seed(1)
dat %>%
  slice_sample(n = 6, by = c(condition1, condition2)) %>%
  count(item_type, sort = TRUE)
# A tibble: 22 × 2
item_type n
<int> <int>
1 10 3
2 34 3
3 1 2
4 6 2
5 7 2
6 15 2
7 20 2
8 21 2
9 23 2
10 25 2
# … with 12 more rows
换句话说,我希望从 item_type
中只获得唯一的抽样。是否可能使用 slice_sample()
实现这一点?
编辑
添加第二个示例的数据:
# Second toy data set: 36 item types with 3 rows each (108 rows in total).
# condition1 is "a" for item types 1-18 and "b" for item types 19-36;
# condition2 cycles through "x", "y", "z" within every item type.
dat <- tibble(
  item_type = rep(1:36, each = 3),
  condition1 = rep(c("a", "b"), each = 54),
  condition2 = rep(c("x", "y", "z"), times = 36)
) %>%
  # remove = FALSE keeps item_type, condition1 and condition2 alongside the
  # combined `unique` key, which is what the 4-column printout requires.
  unite(unique, item_type, condition1, condition2, sep = "-", remove = FALSE)
看起来像这样:
# A tibble: 108 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 1-a-x 1 a x
2 1-a-y 1 a y
3 1-a-z 1 a z
4 2-a-x 2 a x
5 2-a-y 2 a y
6 2-a-z 2 a z
7 3-a-x 3 a x
8 3-a-y 3 a y
9 3-a-z 3 a z
10 4-a-x 4 a x
尝试进行抽样:
# Attempt on the second data set: repeat every condition1 x condition2
# combination once per combination (6 x 6 = 36 rows), label the rows with a
# random permutation of 1..36 as item_type, and keep the rows of `dat` that
# match on all shared columns.
# In this data set condition1 "a" only occurs with item_type 1-18 and "b" only
# with 19-36, so every generated row whose item_type falls in the other half
# has no partner in `dat` and is dropped by the inner join.
inner_join(
dat,
distinct(dat, condition1, condition2) %>%
uncount(n()) %>%
mutate(item_type = sample(n()))
)
这将生成一个长度为20的数据框,具有以下特点:
condition1 condition2 n
1 a x 4
2 a y 4
3 a z 4
4 b x 3
5 b y 4
6 b z 5
英文:
I have a dataframe with the following structure:
# Toy data: 36 item types, each crossed with the 3 levels of condition1 and
# the 2 levels of condition2 (6 rows per item type, 216 rows in total).
dat <- tibble(
  item_type = rep(1:36, each = 6),
  condition1 = rep(c("a", "b", "c"), times = 72),
  condition2 = rep(c("y", "z"), each = 3, times = 36)
) %>%
  # Keep the source columns next to the combined `unique` key. Spell out
  # FALSE: `F` is an ordinary variable that can be reassigned.
  unite(unique, item_type, condition1, condition2, sep = "-", remove = FALSE)
which looks like this:
# A tibble: 216 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 1-a-y 1 a y
2 1-b-y 1 b y
3 1-c-y 1 c y
4 1-a-z 1 a z
5 1-b-z 1 b z
6 1-c-z 1 c z
7 2-a-y 2 a y
8 2-b-y 2 b y
9 2-c-y 2 c y
10 2-a-z 2 a z
I would like to take a random sample of 36 rows. The sample should include 6 repetitions of the condition1
by condition2
combinations without repeating item_type
.
Using slice_sample()
it seems I can get the subset I want...
# Reproducible draw of 6 rows per condition1 x condition2 cell, followed by a
# tally of how many rows each cell received.
set.seed(1)
dat %>%
  slice_sample(n = 6, by = c(condition1, condition2)) %>%
  count(condition1, condition2)
condition1 condition2 n
<chr> <chr> <int>
1 a y 6
2 a z 6
3 b y 6
4 b z 6
5 c y 6
6 c z 6
But on closer inspection I see that item_type
is repeated.
# Same seeded draw as before, but tallied per item_type and sorted so the
# most frequently drawn item types come first.
set.seed(1)
dat %>%
  slice_sample(n = 6, by = c(condition1, condition2)) %>%
  count(item_type, sort = TRUE)
# A tibble: 22 × 2
item_type n
<int> <int>
1 10 3
2 34 3
3 1 2
4 6 2
5 7 2
6 15 2
7 20 2
8 21 2
9 23 2
10 25 2
# … with 12 more rows
In other words, I would like only unique pulls overall from item_type
.
Is it possible to get slice_sample()
to do this?
EDIT
Adding second toy data example.
# Second toy data set: 36 item types with 3 rows each (108 rows in total).
# condition1 is "a" for item types 1-18 and "b" for item types 19-36;
# condition2 cycles through "x", "y", "z" within every item type.
dat <- tibble(
  item_type = rep(1:36, each = 3),
  condition1 = rep(c("a", "b"), each = 54),
  condition2 = rep(c("x", "y", "z"), times = 36)
) %>%
  # Keep the source columns next to the combined `unique` key. Spell out
  # FALSE: `F` is an ordinary variable that can be reassigned.
  unite(unique, item_type, condition1, condition2, sep = "-", remove = FALSE)
Which looks like this:
# A tibble: 108 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 1-a-x 1 a x
2 1-a-y 1 a y
3 1-a-z 1 a z
4 2-a-x 2 a x
5 2-a-y 2 a y
6 2-a-z 2 a z
7 3-a-x 3 a x
8 3-a-y 3 a y
9 3-a-z 3 a z
10 4-a-x 4 a x
Attempt to sample:
# Attempt on the second data set: repeat every condition1 x condition2
# combination once per combination (6 x 6 = 36 rows), label the rows with a
# random permutation of 1..36 as item_type, and keep the rows of `dat` that
# match on all shared columns.
# In this data set condition1 "a" only occurs with item_type 1-18 and "b" only
# with 19-36, so every generated row whose item_type falls in the other half
# has no partner in `dat` and is dropped by the inner join.
inner_join(
dat,
distinct(dat,condition1, condition2) %>%
uncount(n()) %>%
mutate(item_type = sample(n()))
)
Which produces a dataframe of length 20 with the following characteristics:
condition1 condition2 n
<chr> <chr> <int>
1 a x 4
2 a y 4
3 a z 4
4 b x 3
5 b y 4
6 b z 5
答案1
得分: 2
以下是您要翻译的代码部分:
You could do this:
# Repeat every condition1 x condition2 combination once per combination
# (6 x 6 = 36 rows for the first data set), label the rows with a random
# permutation of 1..36 as item_type, and pull the matching rows from `dat`.
# This relies on item_type being exactly the integers 1..36 and on every
# item_type occurring with every combination; otherwise rows are lost in the
# join.
inner_join(
  dat,
  distinct(dat, condition1, condition2) %>%
    uncount(n()) %>%
    mutate(item_type = sample(n())),
  # Explicit keys: same columns the natural join would pick, without the
  # "Joining with `by = ...`" message.
  by = c("item_type", "condition1", "condition2")
)
Output:
# A tibble: 36 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 1-b-z 1 b z
2 2-a-z 2 a z
3 3-c-y 3 c y
4 4-c-z 4 c z
5 5-b-z 5 b z
6 6-a-y 6 a y
7 7-c-y 7 c y
8 8-a-y 8 a y
9 9-a-y 9 a y
10 10-c-z 10 c z
# … with 26 more rows
On the second dataset, you need to get the min/max range to sample:
# Variant for data where each condition1 level owns its own block of item
# types. Every condition1 x condition2 combination is repeated once per
# combination, then item types are drawn without replacement inside each
# condition1 level, from that level's min..max item_type range.
# Assumptions: item types within a condition1 level form a contiguous integer
# range shared by all of its condition2 levels (only imin[1]/imax[1] are
# used), and the range holds at least as many values as rows in the level;
# otherwise sample() errors.
inner_join(
  dat,
  distinct(dat, condition1, condition2) %>%
    uncount(n()) %>%
    inner_join(
      dat %>%
        group_by(condition1, condition2) %>%
        summarize(
          imin = min(item_type),
          imax = max(item_type),
          .groups = "drop"
        ),
      by = c("condition1", "condition2")
    ) %>%
    group_by(condition1) %>%
    mutate(item_type = sample(imin[1]:imax[1], size = n())) %>%
    ungroup() %>%
    select(-c(imin, imax)),
  by = c("item_type", "condition1", "condition2")
)
Output:
# A tibble: 36 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 1-a-y 1 a y
2 2-a-z 2 a z
3 3-a-z 3 a z
4 4-a-y 4 a y
5 5-a-z 5 a z
6 6-a-y 6 a y
7 7-a-x 7 a x
8 8-a-z 8 a z
9 9-a-y 9 a y
10 10-a-z 10 a z
# … with 26 more rows
英文:
You could do this:
# Repeat every condition1 x condition2 combination once per combination
# (6 x 6 = 36 rows for the first data set), label the rows with a random
# permutation of 1..36 as item_type, and pull the matching rows from `dat`.
# This relies on item_type being exactly the integers 1..36 and on every
# item_type occurring with every combination; otherwise rows are lost in the
# join.
inner_join(
  dat,
  distinct(dat, condition1, condition2) %>%
    uncount(n()) %>%
    mutate(item_type = sample(n())),
  # Explicit keys: same columns the natural join would pick, without the
  # "Joining with `by = ...`" message.
  by = c("item_type", "condition1", "condition2")
)
Output:
# A tibble: 36 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 1-b-z 1 b z
2 2-a-z 2 a z
3 3-c-y 3 c y
4 4-c-z 4 c z
5 5-b-z 5 b z
6 6-a-y 6 a y
7 7-c-y 7 c y
8 8-a-y 8 a y
9 9-a-y 9 a y
10 10-c-z 10 c z
# … with 26 more rows
On the second dataset, you need to get the min/max range to sample:
# Variant for data where each condition1 level owns its own block of item
# types. Every condition1 x condition2 combination is repeated once per
# combination, then item types are drawn without replacement inside each
# condition1 level, from that level's min..max item_type range.
# Assumptions: item types within a condition1 level form a contiguous integer
# range shared by all of its condition2 levels (only imin[1]/imax[1] are
# used), and the range holds at least as many values as rows in the level;
# otherwise sample() errors.
inner_join(
  dat,
  distinct(dat, condition1, condition2) %>%
    uncount(n()) %>%
    inner_join(
      dat %>%
        group_by(condition1, condition2) %>%
        summarize(
          imin = min(item_type),
          imax = max(item_type),
          .groups = "drop"
        ),
      by = c("condition1", "condition2")
    ) %>%
    group_by(condition1) %>%
    mutate(item_type = sample(imin[1]:imax[1], size = n())) %>%
    ungroup() %>%
    select(-c(imin, imax)),
  by = c("item_type", "condition1", "condition2")
)
Output:
# A tibble: 36 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 1-a-y 1 a y
2 2-a-z 2 a z
3 3-a-z 3 a z
4 4-a-y 4 a y
5 5-a-z 5 a z
6 6-a-y 6 a y
7 7-a-x 7 a x
8 8-a-z 8 a z
9 9-a-y 9 a y
10 10-a-z 10 a z
# … with 26 more rows
答案2
得分: 1
以下是您要求的代码翻译:
Try
# nplyr supplies nest_slice_sample(); dplyr and tidyr supply the pipe,
# nest() and unnest().
library(nplyr)
library(dplyr)
library(tidyr)
# Nest all other columns per item_type, sample inside each nested data frame
# (presumably one row each, given n = 1 — confirm against the nplyr docs),
# then flatten again, giving one row per item_type.
# NOTE(review): nothing in this pipeline refers to condition1/condition2, so
# it does not control how often each combination is drawn.
dat %>%
nest(data = -item_type) %>%
nest_slice_sample(data, n = 1) %>%
unnest(data)
-output
# A tibble: 36 × 4
item_type unique condition1 condition2
<int> <chr> <chr> <chr>
1 1 1-c-z c z
2 2 2-b-z b z
3 3 3-b-y b y
4 4 4-c-y c y
5 5 5-c-z c z
6 6 6-b-z b z
7 7 7-a-z a z
8 8 8-c-z c z
9 9 9-b-y b y
10 10 10-a-y a y
# … with 26 more rows
Or perhaps we need
# Sample the condition1 x condition2 groups one after another, excluding item
# types already drawn for an earlier group, so no item_type is used twice.
# Rows wanted per group; previously hard-coded inside the loop.
n_per_group <- 6L
lst1 <- split(dat, dat[c("condition1", "condition2")], drop = TRUE)
lst2 <- vector("list", length(lst1))
item_type_rm <- integer(0)
for (i in seq_along(lst1)) {
  tmp1 <- lst1[[i]] %>%
    filter(!item_type %in% item_type_rm) %>%
    slice_sample(n = n_per_group)
  # slice_sample() silently returns fewer rows when the group has run out of
  # unused item types; surface that instead of yielding an unbalanced sample.
  if (nrow(tmp1) < n_per_group) {
    warning(
      "Group '", names(lst1)[i], "' only had ", nrow(tmp1),
      " eligible rows left (wanted ", n_per_group, ").",
      call. = FALSE
    )
  }
  item_type_rm <- c(item_type_rm, tmp1$item_type)
  lst2[[i]] <- tmp1
}
out <- bind_rows(lst2)
out
# A tibble: 36 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 17-a-x 17 a x
2 5-a-x 5 a x
3 9-a-x 9 a x
4 2-a-x 2 a x
5 7-a-x 7 a x
6 3-a-x 3 a x
7 31-b-x 31 b x
8 27-b-x 27 b x
9 36-b-x 36 b x
10 19-b-x 19 b x
# … with 26 more rows
> out %>% count(item_type) %>% pull(n)
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
请注意,这里只翻译了代码部分,没有包括注释和输出。
英文:
Try
# nplyr supplies nest_slice_sample(); dplyr and tidyr supply the pipe,
# nest() and unnest().
library(nplyr)
library(dplyr)
library(tidyr)
# Nest all other columns per item_type, sample inside each nested data frame
# (presumably one row each, given n = 1 — confirm against the nplyr docs),
# then flatten again, giving one row per item_type.
# NOTE(review): nothing in this pipeline refers to condition1/condition2, so
# it does not control how often each combination is drawn.
dat %>%
nest(data = -item_type) %>%
nest_slice_sample(data, n = 1) %>%
unnest(data)
-output
# A tibble: 36 × 4
item_type unique condition1 condition2
<int> <chr> <chr> <chr>
1 1 1-c-z c z
2 2 2-b-z b z
3 3 3-b-y b y
4 4 4-c-y c y
5 5 5-c-z c z
6 6 6-b-z b z
7 7 7-a-z a z
8 8 8-c-z c z
9 9 9-b-y b y
10 10 10-a-y a y
# … with 26 more rows
Or perhaps we need
# Sample the condition1 x condition2 groups one after another, excluding item
# types already drawn for an earlier group, so no item_type is used twice.
# Rows wanted per group; previously hard-coded inside the loop.
n_per_group <- 6L
lst1 <- split(dat, dat[c("condition1", "condition2")], drop = TRUE)
lst2 <- vector("list", length(lst1))
item_type_rm <- integer(0)
for (i in seq_along(lst1)) {
  tmp1 <- lst1[[i]] %>%
    filter(!item_type %in% item_type_rm) %>%
    slice_sample(n = n_per_group)
  # slice_sample() silently returns fewer rows when the group has run out of
  # unused item types; surface that instead of yielding an unbalanced sample.
  if (nrow(tmp1) < n_per_group) {
    warning(
      "Group '", names(lst1)[i], "' only had ", nrow(tmp1),
      " eligible rows left (wanted ", n_per_group, ").",
      call. = FALSE
    )
  }
  item_type_rm <- c(item_type_rm, tmp1$item_type)
  lst2[[i]] <- tmp1
}
out <- bind_rows(lst2)
out
# A tibble: 36 × 4
unique item_type condition1 condition2
<chr> <int> <chr> <chr>
1 17-a-x 17 a x
2 5-a-x 5 a x
3 9-a-x 9 a x
4 2-a-x 2 a x
5 7-a-x 7 a x
6 3-a-x 3 a x
7 31-b-x 31 b x
8 27-b-x 27 b x
9 36-b-x 36 b x
10 19-b-x 19 b x
# … with 26 more rows
> out %>% count(item_type) %>% pull(n)
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论