英文:
How do I create a df in r composed of grouped or nested counts?
问题
这是你要翻译的内容:
Edited to add: Sample data is now present--thanks to @LMc
Hi, folks. I've got a dataframe like this, filtered from IMDb:
> structure(list(tconst = c("tt0003419", "tt0003419", "tt0004013",
"tt0005231", "tt0005231", "tt0005615", "tt0005615", "tt0005772",
"tt0005951", "tt0005951", "tt0006434", "tt0006434", "tt0006554",
"tt0006820", "tt0007111", "tt0008826", "tt0010323", "tt0010323",
"tt0010323", "tt0010323"), primaryTitle = c("The Student of Prague",
"The Student of Prague", "The Ghost Breaker", "The Hound of the Baskervilles",
"The Hound of the Baskervilles", "Life Without Soul", "Life Without Soul",
"Mortmain", "Satan's Rhapsody", "Satan's Rhapsody", "Black Orchids",
"Black Orchids", "The Crimson Stain Mystery", "Homunculus, 1. Teil",
"A Night of Horror", "Alraune", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari"), startYear = structure(c(-20819,
-20819, -20454, -20089, -20089, -20089, -20089, -20089, -19358,
-19358, -19358, -19358, -19724, -19724, -19358, -18628, -18263,
-18263, -18263, -18263), class = "Date"), runtimeMinutes = c("85",
"85", "60", "50", "50", "70", "70", "\\N", "55", "55", "50",
"50", "\\N", "69", "56", "80", "76", "76", "76", "76"), decade = c(1910,
1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910,
1910, 1910, 1910, 1910, 1920, 1920, 1920, 1920), genre = c("Drama",
"Fantasy", "Adventure", "Mystery", "Crime", "Drama", "Sci-Fi",
"Drama", "Fantasy", "Drama", "Drama", "Drama", "Mystery", "Sci-Fi",
"Drama", "Sci-Fi", "Thriller", "Mystery", "Mystery", "Thriller"
), rating = c(6.5, 6.5, 5.2, 3.1, 3.1, 6.6, 6.6, 5.8, 6.8, 6.8,
4.8, 4.8, 6.9, 6.1, 6.1, 5.5, 8.1, 8.1, 8.1, 8.1), numVotes = c(2063,
2063, 36, 40, 40, 53, 53, 23, 719, 719, 18, 18, 18, 91, 20, 51,
62119, 62119, 62119, 62119)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
以下是翻译好的内容:
编辑后附加: 现在有示例数据--感谢@LMc
嗨,大家好。我有一个来自IMDb的数据框,如下所示:
> structure(list(tconst = c("tt0003419", "tt0003419", "tt0004013",
"tt0005231", "tt0005231", "tt0005615", "tt0005615", "tt0005772",
"tt0005951", "tt0005951", "tt0006434", "tt0006434", "tt0006554",
"tt0006820", "tt0007111", "tt0008826", "tt0010323", "tt0010323",
"tt0010323", "tt0010323"), primaryTitle = c("The Student of Prague",
"The Student of Prague", "The Ghost Breaker", "The Hound of the Baskervilles",
"The Hound of the Baskervilles", "Life Without Soul", "Life Without Soul",
"Mortmain", "Satan's Rhapsody", "Satan's Rhapsody", "Black Orchids",
"Black Orchids", "The Crimson Stain Mystery", "Homunculus, 1. Teil",
"A Night of Horror", "Alraune", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari"), startYear = structure(c(-20819,
-20819, -20454, -20089, -20089, -20089, -20089, -20089, -19358,
-19358, -19358, -19358, -19724, -19724, -19358, -18628, -18263,
-18263, -18263, -18263), class = "Date"), runtimeMinutes = c("85",
"85", "60", "50", "50", "70", "70", "\\N", "55", "55", "50",
"50", "\\N", "69", "56", "80", "76", "76", "76", "76"), decade = c(1910,
1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910,
1910, 1910, 1910, 1910, 1920, 1920, 1920, 1920), genre = c("Drama",
"Fantasy", "Adventure", "Mystery", "Crime", "Drama", "Sci-Fi",
"Drama", "Fantasy", "Drama", "Drama", "Drama", "Mystery", "Sci-Fi",
"Drama", "Sci-Fi", "Thriller", "Mystery", "Mystery", "Thriller"
), rating = c(6.5, 6.5, 5.2, 3.1, 3.1, 6.6, 6.6, 5.8, 6.8, 6.8,
4.8, 4.8, 6.9, 6.1, 6.1, 5.5, 8.1, 8.1, 8.1, 8.1), numVotes = c(2063,
2063, 36, 40, 40, 53, 53, 23, 719, 719, 18, 18, 18, 91, 20, 51,
62119, 62119, 62119, 62119)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
<details>
<summary>英文:</summary>
Edited to add: Sample data is now present--thanks to @LMc
Hi, folks. I've got a dataframe like this, filtered from IMDb:
> structure(list(tconst = c("tt0003419", "tt0003419", "tt0004013",
"tt0005231", "tt0005231", "tt0005615", "tt0005615", "tt0005772",
"tt0005951", "tt0005951", "tt0006434", "tt0006434", "tt0006554",
"tt0006820", "tt0007111", "tt0008826", "tt0010323", "tt0010323",
"tt0010323", "tt0010323"), primaryTitle = c("The Student of Prague",
"The Student of Prague", "The Ghost Breaker", "The Hound of the Baskervilles",
"The Hound of the Baskervilles", "Life Without Soul", "Life Without Soul",
"Mortmain", "Satan's Rhapsody", "Satan's Rhapsody", "Black Orchids",
"Black Orchids", "The Crimson Stain Mystery", "Homunculus, 1. Teil",
"A Night of Horror", "Alraune", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari"), startYear = structure(c(-20819,
-20819, -20454, -20089, -20089, -20089, -20089, -20089, -19358,
-19358, -19358, -19358, -19724, -19724, -19358, -18628, -18263,
-18263, -18263, -18263), class = "Date"), runtimeMinutes = c("85",
"85", "60", "50", "50", "70", "70", "\\N", "55", "55", "50",
"50", "\\N", "69", "56", "80", "76", "76", "76", "76"), decade = c(1910,
1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910,
1910, 1910, 1910, 1910, 1920, 1920, 1920, 1920), genre = c("Drama",
"Fantasy", "Adventure", "Mystery", "Crime", "Drama", "Sci-Fi",
"Drama", "Fantasy", "Drama", "Drama", "Drama", "Mystery", "Sci-Fi",
"Drama", "Sci-Fi", "Thriller", "Mystery", "Mystery", "Thriller"
), rating = c(6.5, 6.5, 5.2, 3.1, 3.1, 6.6, 6.6, 5.8, 6.8, 6.8,
4.8, 4.8, 6.9, 6.1, 6.1, 5.5, 8.1, 8.1, 8.1, 8.1), numVotes = c(2063,
2063, 36, 40, 40, 53, 53, 23, 719, 719, 18, 18, 18, 91, 20, 51,
62119, 62119, 62119, 62119)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
What I want to get is a dataframe of counts sorted by decade and genre, something like this:
| decade | genre | count |
|--------|---------|-------|
|1910 | Drama | 15 |
|1920 | Drama | 27 |
|1930 | Drama | 32 |
|... | ... | ... |
|1910 | Fantasy | 12 |
|1920 | Fantasy | 23 |
|1930 | Fantasy | 41 |
|... | ... | ... |
...and so on, through each group. I've tried several things, including this:
subgenres %>%
group_by(decade,genre) %>%
summarise(count=n())
But I get this error:
> subgenres %>%
> group_by(decade) %>%
> summarise(count=n())
> Error: 'format_error' is not an exported object from 'namespace:cli'
> Error in count(., decade, genre) : object 'decade' not found
I can count each subgenres$genre, but I don't know how to nest groups and I'm not sure what my keyword search should be to do the research myself. I thought it was a type problem, but changes there didn't seem to work, either.
tibble [6,809 × 8] (S3: tbl_df/tbl/data.frame)
$ tconst : chr [1:6809] "tt0003419" "tt0003419" "tt0004013" "tt0005231" ...
$ primaryTitle : chr [1:6809] "The Student of Prague" "The Student of Prague" "The Ghost Breaker" "The Hound of the Baskervilles" ...
$ startYear : Date[1:6809], format: "1913-01-01" "1913-01-01" "1914-01-01" "1915-01-01" ...
$ runtimeMinutes: chr [1:6809] "85" "85" "60" "50" ...
$ decade : num [1:6809] 1910 1910 1910 1910 1910 1910 1910 1910 1910 1910 ...
$ genre : chr [1:6809] "Drama" "Fantasy" "Adventure" "Mystery" ...
$ rating : num [1:6809] 6.5 6.5 5.2 3.1 3.1 6.6 6.6 5.8 6.8 6.8 ...
$ numVotes : num [1:6809] 2063 2063 36 40 40 ...
Any insight is greatly appreciated!
</details>
# 答案1
**得分**: 1
使用您发布的数据和代码,以下两种写法都可以正常运行:
```R
library(dplyr)
subgenres %>%
group_by(decade, genre) %>%
summarise(count=n())
# dplyr >= 1.1.0
subgenres %>%
summarise(count=n(), .by = c(decade, genre))
```
当我将 `+` 插入代码中,我能够在一定程度上复制您的错误:
```R
df %>% +
group_by(decade, genre) %>% +
summarise(count=n())
Error in group_by(decade, genre) : object 'decade' not found
```
希望这些翻译能对您有所帮助。
英文:
Using your posted data and code the following both work:
library(dplyr)
subgenres %>%
group_by(decade,genre) %>%
summarise(count=n())
# dplyr >= 1.1.0
subgenres %>%
summarise(count=n(), .by = c(decade, genre))
When I insert the `+` into the code, I am able to somewhat replicate your error:
df %>% +
group_by(decade,genre) %>% +
summarise(count=n())
Error in group_by(decade, genre) : object 'decade' not found
Output
decade genre count
<dbl> <chr> <int>
1 1910 Drama 7
2 1910 Fantasy 2
3 1910 Adventure 1
4 1910 Mystery 2
5 1910 Crime 1
6 1910 Sci-Fi 3
7 1920 Thriller 2
8 1920 Mystery 2
Data
structure(list(tconst = c("tt0003419", "tt0003419", "tt0004013",
"tt0005231", "tt0005231", "tt0005615", "tt0005615", "tt0005772",
"tt0005951", "tt0005951", "tt0006434", "tt0006434", "tt0006554",
"tt0006820", "tt0007111", "tt0008826", "tt0010323", "tt0010323",
"tt0010323", "tt0010323"), primaryTitle = c("The Student of Prague",
"The Student of Prague", "The Ghost Breaker", "The Hound of the Baskervilles",
"The Hound of the Baskervilles", "Life Without Soul", "Life Without Soul",
"Mortmain", "Satan's Rhapsody", "Satan's Rhapsody", "Black Orchids",
"Black Orchids", "The Crimson Stain Mystery", "Homunculus, 1. Teil",
"A Night of Horror", "Alraune", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari"), startYear = structure(c(-20819,
-20819, -20454, -20089, -20089, -20089, -20089, -20089, -19358,
-19358, -19358, -19358, -19724, -19724, -19358, -18628, -18263,
-18263, -18263, -18263), class = "Date"), runtimeMinutes = c("85",
"85", "60", "50", "50", "70", "70", "\\N", "55", "55", "50",
"50", "\\N", "69", "56", "80", "76", "76", "76", "76"), decade = c(1910,
1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910,
1910, 1910, 1910, 1910, 1920, 1920, 1920, 1920), genre = c("Drama",
"Fantasy", "Adventure", "Mystery", "Crime", "Drama", "Sci-Fi",
"Drama", "Fantasy", "Drama", "Drama", "Drama", "Mystery", "Sci-Fi",
"Drama", "Sci-Fi", "Thriller", "Mystery", "Mystery", "Thriller"
), rating = c(6.5, 6.5, 5.2, 3.1, 3.1, 6.6, 6.6, 5.8, 6.8, 6.8,
4.8, 4.8, 6.9, 6.1, 6.1, 5.5, 8.1, 8.1, 8.1, 8.1), numVotes = c(2063,
2063, 36, 40, 40, 53, 53, 23, 719, 719, 18, 18, 18, 91, 20, 51,
62119, 62119, 62119, 62119)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论