英文:
How to split character column at divider and create new binary columns?
问题
我有一个名为df_cat
的数据框,看起来像这样:
dput(df_cat)
结构(list(Name = c("Name1", "Name2", "Name3", "Name4", "Name5",
"Name6"), Category = c("App", "App, Comms_2", "App, Comms, Tech",
"App, Tech", "Comms, Tech", "BS, Tech")), class = "data.frame", row.names = c(NA,
-6L))
我想要将Category
列以**","**为分隔符拆分,并为每个输入创建新列,前面带有原始列名。这些列应该是二进制的,指示一行是否包含这个类别(1)或不包含(0):
Name Category_App Category_Comms_2 Category_Tech Category_BS
Name1 1 0 0 0
Name2 1 1 0 0
Name3 1 1 1 0
Name4 1 0 1 0
Name5 0 1 1 0
Name6 0 0 1 1
您知道我如何在一个数据集中实现这个目标吗?该数据集有超过23k行和100多个不同的类别。
英文:
I have a dataframe df_cat
looking like this:
dput(df_cat)
structure(list(Name = c("Name1", "Name2", "Name3", "Name4", "Name5",
"Name6"), Category = c("App", "App, Comms_2", "App, Comms, Tech",
"App, Tech", "Comms, Tech", "BS, Tech")), class = "data.frame", row.names = c(NA,
-6L))
I want to split column Category
at the divider "," and create new columns for each input with the original column name in front. These columns should be binary and indicate, whether a row contains this category (1) or not (0):
Name Category_App Category_Comms_2 Category_Tech Category_BS
Name1 1 0 0 0
Name2 1 1 0 0
Name3 1 1 1 0
Name4 1 0 1 0
Name5 0 1 1 0
Name6 0 0 1 1
Any idea how I can accomplish this in a dataset, that has over 23k rows and over 100 different categories?
答案1
得分: 3
可以使用 separate_rows
和 pivot_wider
轻松实现:
library(tidyverse)
df %>%
mutate(row = row_number()) %>%
separate_rows(Category) %>%
pivot_wider(names_from = Category, values_from = Category,
names_prefix = "Category_",
values_fn = function(x) 1, values_fill = 0) %>%
select(-row)
# 一个 tibble: 6 × 5
Name Category_App Category_Comms Category_Tech Category_BS
<chr> <dbl> <dbl> <dbl> <dbl>
1 Name1 1 0 0 0
2 Name2 1 1 0 0
3 Name3 1 1 1 0
4 Name4 1 0 1 0
5 Name5 0 1 1 0
6 Name6 0 0 1 1
如果要保留原始 category
列,请使用 bind_cols
添加:
df %>%
mutate(row = row_number()) %>%
separate_rows(Category) %>%
pivot_wider(names_from = Category, values_from = Category,
names_prefix = "Category_",
values_fn = function(x) 1, values_fill = 0) %>%
select(-row) %>%
bind_cols(., df %>% select(Category))
# 一个 tibble: 6 × 6
Name Category_App Category_Comms Category_Tech Category_BS Category
<chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 Name1 1 0 0 0 App
2 Name2 1 1 0 0 App, Comms
3 Name3 1 1 1 0 App, Comms, Tech
4 Name4 1 0 1 0 App, Tech
5 Name5 0 1 1 0 Comms, Tech
6 Name6 0 0 1 1 BS, Tech
英文:
This can be done easily with separate_rows
and pivot_wider
:
library(tidyverse)
df %>%
mutate(row = row_number()) %>%
separate_rows(Category) %>%
pivot_wider(names_from = Category, values_from = Category,
names_prefix = "Category_",
values_fn = function(x) 1, values_fill = 0) %>%
select(-row)
# A tibble: 6 × 5
Name Category_App Category_Comms Category_Tech Category_BS
<chr> <dbl> <dbl> <dbl> <dbl>
1 Name1 1 0 0 0
2 Name2 1 1 0 0
3 Name3 1 1 1 0
4 Name4 1 0 1 0
5 Name5 0 1 1 0
6 Name6 0 0 1 1
If you want to retain the original category
column, add it with bind_cols
:
df %>%
mutate(row = row_number()) %>%
separate_rows(Category) %>%
pivot_wider(names_from = Category, values_from = Category,
names_prefix = "Category_",
values_fn = function(x) 1, values_fill = 0) %>%
select(-row) %>%
bind_cols(., df %>% select(Category))
# A tibble: 6 × 6
Name Category_App Category_Comms Category_Tech Category_BS Category
<chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 Name1 1 0 0 0 App
2 Name2 1 1 0 0 App, Comms
3 Name3 1 1 1 0 App, Comms, Tech
4 Name4 1 0 1 0 App, Tech
5 Name5 0 1 1 0 Comms, Tech
6 Name6 0 0 1 1 BS, Tech
答案2
得分: 0
如果您事先知道可能的类别,甚至可以硬编码它们:
英文:
If you know the possible categories up front, you could even just hard-code them:
library(tidyverse)
df <- tibble::tribble(
~Name, ~Category,
"Name1", "App",
"Name2", "App, Comms",
"Name3", "App, Comms, Tech",
"Name4", "App, Tech",
"Name5", "Comms, Tech",
"Name6", "BS, Tech"
)
df |>
mutate(
Category_App = str_detect(Category, "App"),
Category_Comms = str_detect(Category, "Comms"),
Category_Tech = str_detect(Category, "Tech"),
Category_BS = str_detect(Category, "BS")
)
#> # A tibble: 6 × 6
#> Name Category Category_App Category_Comms Category_Tech Category_BS
#> <chr> <chr> <lgl> <lgl> <lgl> <lgl>
#> 1 Name1 App TRUE FALSE FALSE FALSE
#> 2 Name2 App, Comms TRUE TRUE FALSE FALSE
#> 3 Name3 App, Comms, Tech TRUE TRUE TRUE FALSE
#> 4 Name4 App, Tech TRUE FALSE TRUE FALSE
#> 5 Name5 Comms, Tech FALSE TRUE TRUE FALSE
#> 6 Name6 BS, Tech FALSE FALSE TRUE TRUE
<sup>Created on 2023-03-20 with reprex v2.0.2</sup>
答案3
得分: 0
使用data.table
library(data.table)
setDT(df)
df[, unlist(strsplit(Category, "(,| )+")), by = Name
][, dcast(.SD, Name ~ paste0("Category_", V1), length)]
# Name Category_App Category_BS Category_Comms Category_Tech
# <char> <int> <int> <int> <int>
# 1: Name1 1 0 0 0
# 2: Name2 1 0 1 0
# 3: Name3 1 0 1 1
# 4: Name4 1 0 0 1
# 5: Name5 0 0 1 1
# 6: Name6 0 1 0 1
英文:
Using data.table
library(data.table)
setDT(df)
df[, unlist(strsplit(Category, "(,| )+")), by = Name
][, dcast(.SD, Name ~ paste0("Category_", V1), length)]
# Name Category_App Category_BS Category_Comms Category_Tech
# <char> <int> <int> <int> <int>
# 1: Name1 1 0 0 0
# 2: Name2 1 0 1 0
# 3: Name3 1 0 1 1
# 4: Name4 1 0 0 1
# 5: Name5 0 0 1 1
# 6: Name6 0 1 0 1
答案4
得分: 0
库(stringr)
库(magrittr)
# 获取所有可能类别的向量
all_categories <- df_cat$Category %>%
str_split(",") %>%
unlist() %>%
str_squish() %>%
unique()
# 遍历所有类别
for( category in all_categories){
# 为每个类别创建一列
# 然后检查Category列中是否存在该类别
df_cat[,category] <- str_detect(df_cat$Category,category) %>%
as.numeric()
}
英文:
library(stringr)
library(magrittr)
# get vector of all the possible categories
all_categories <- df_cat$Category %>%
str_split(",") %>% # split at comma
unlist() %>% # convert from list to vector
str_squish() %>% # remove excess white space
unique() # find unique values
# loop through all the categories
for( category in all_categories){
# create a column for each category
# Then check for the category in the Category column
df_cat[,category] <- str_detect(df_cat$Category,category) %>%
as.numeric() # convert TRUE/FALSE to 0/1
}
答案5
得分: 0
在基本R中,你可以这样做:
a <- table(stack(setNames(strsplit(df$Category, "[, ]+"), df$Name))[2:1])
data.frame(df, as.data.frame.matrix(a), row.names = NULL)
Name Category App BS Comms Tech
1 Name1 App 1 0 0 0
2 Name2 App, Comms 1 0 1 0
3 Name3 App, Comms, Tech 1 0 1 1
4 Name4 App, Tech 1 0 0 1
5 Name5 Comms, Tech 0 0 1 1
6 Name6 BS, Tech 0 1 0 1
英文:
in base R you could do:
a <- table(stack(setNames(strsplit(df$Category, "[, ]+"), df$Name))[2:1])
data.frame(df, as.data.frame.matrix(a), row.names = NULL)
Name Category App BS Comms Tech
1 Name1 App 1 0 0 0
2 Name2 App, Comms 1 0 1 0
3 Name3 App, Comms, Tech 1 0 1 1
4 Name4 App, Tech 1 0 0 1
5 Name5 Comms, Tech 0 0 1 1
6 Name6 BS, Tech 0 1 0 1
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论