英文:
R: Writing Functions for Nested Datasets
问题
我正在使用R编程语言进行工作。
我有以下数据集:
set.seed(123)
library(dplyr)
Patient_ID = 1:5000
gender <- c("Male","Female")
gender <- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
gender <- as.factor(gender)
status <- c("Immigrant","Citizen")
status <- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
status <- as.factor(status )
height = rnorm(5000, 150, 10)
weight = rnorm(5000, 90, 10)
disease = sample(c(1, 0), 5000, replace = TRUE)
my_data = data.frame(Patient_ID, gender, status, height, weight,disease)
以下是一个代码,我从中选择所有男性公民 - 然后选择最高的25%男性公民,然后从这25%最高的男性公民中选择最重的25%男性公民,并计算他们中有多少人患有疾病:
part_1 = my_data[my_data$gender == "Male" & my_data$status == "Immigrant",]
part_1 = part_1 %>% arrange(desc(height))
limits = as.integer(seq(1, nrow(part_1), by = 0.25*nrow(part_1)))
limits = c(limits, nrow(part_1))
create_h <- function(part_1) {
limits <- as.integer(seq(1, nrow(part_1), by = 0.25 * nrow(part_1)))
limits <- c(limits, nrow(part_1))
h_list <- list()
for (i in 1:(length(limits) - 1)) {
h_list[[i]] <- part_1[limits[i]:(limits[i + 1] - 1), ]
}
return(h_list)
}
h_list <- create_h(part_1)
for (i in seq_along(h_list)) {
assign(paste0("h_", i), h_list[[i]])
}
part_2 = h_1 %>% arrange(desc(weight))
limits = as.integer(seq(1, nrow(part_2), by = 0.25*nrow(part_2)))
limits = c(limits, nrow(part_2))
create_w <- function(part_2) {
limits <- as.integer(seq(1, nrow(part_2), by = 0.25 * nrow(part_2)))
limits <- c(limits, nrow(part_2))
w_list <- list()
for (i in 1:(length(limits) - 1)) {
w_list[[i]] <- part_1[limits[i]:(limits[i + 1] - 1), ]
}
return(w_list)
}
w_list <- create_h(part_2)
for (i in seq_along(w_list)) {
assign(paste0("w_", i), w_list[[i]])
}
r1 = data.frame(gender = "male", status = "immigrant", min_height = min(w_1$height), max_height = max(w_2$height), min_weight = min(w_1$weight), max_weight = max(w_1$weight), disease_rate = mean(w_1$disease), total_count = nrow(w_1), disease_count = sum(w_1$disease == "1") )
结果看起来类似于这样:
gender status min_height max_height min_weight max_weight disease_rate total_count disease_count
1 male immigrant 157.7242 173.7773 94.77408 117.1924 0.6 40 24
我的问题是:我试图扩展此代码,以计算所有这些组合的疾病率。下面,我试图将这些组合表示为树状图(其中我的代码对应于“红框”):
(注意:这显然只是树的一小部分 - 完全绘制整个树几乎是不可能的)
有人可以向我展示如何使用我的一般代码结构为所有可能的组合创建结果r_1、r_2.... r_n,然后将它们附加到单个数据集中吗?是否有一种使用函数/循环的快速方法来实现这一点?
谢谢!
来源:https://www.smartdraw.com/software/tree-diagram-maker.htm
英文:
I am working with the R programming language.
I have the following dataset:
set.seed(123)
library(dplyr)
Patient_ID = 1:5000
gender <- c("Male","Female")
gender <- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
gender <- as.factor(gender)
status <- c("Immigrant","Citizen")
status <- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
status <- as.factor(status )
height = rnorm(5000, 150, 10)
weight = rnorm(5000, 90, 10)
disease = sample(c(1, 0), 5000, replace = TRUE)
my_data = data.frame(Patient_ID, gender, status, height, weight,disease)
Below is a code in which I take all male citizens - then take the 25% tallest male citizens, of these 25% tallest male citizens I take the 25% heaviest male citizens, and calculate how many of them have the disease:
part_1 = my_data[my_data$gender == "Male" & my_data$status == "Immigrant",]
part_1 = part_1 %>% arrange(desc(height))
limits = as.integer(seq(1, nrow(part_1), by = 0.25*nrow(part_1)))
limits = c(limits, nrow(part_1))
create_h <- function(part_1) {
limits <- as.integer(seq(1, nrow(part_1), by = 0.25 * nrow(part_1)))
limits <- c(limits, nrow(part_1))
h_list <- list()
for (i in 1:(length(limits) - 1)) {
h_list[[i]] <- part_1[limits[i]:(limits[i + 1] - 1), ]
}
return(h_list)
}
h_list <- create_h(part_1)
for (i in seq_along(h_list)) {
assign(paste0("h_", i), h_list[[i]])
}
###########################
part_2 = h_1 %>% arrange(desc(weight))
limits = as.integer(seq(1, nrow(part_2), by = 0.25*nrow(part_2)))
limits = c(limits, nrow(part_2))
create_w <- function(part_2) {
limits <- as.integer(seq(1, nrow(part_2), by = 0.25 * nrow(part_2)))
limits <- c(limits, nrow(part_2))
w_list <- list()
for (i in 1:(length(limits) - 1)) {
w_list[[i]] <- part_1[limits[i]:(limits[i + 1] - 1), ]
}
return(w_list)
}
w_list <- create_h(part_2)
for (i in seq_along(w_list)) {
assign(paste0("w_", i), w_list[[i]])
}
##############################
r1 = data.frame(gender = "male", status = "immigrant", min_height = min(w_1$height), max_height = max(w_2$height), min_weight = min(w_1$weight), max_weight = max(w_1$weight), disease_rate = mean(w_1$disease), total_count = nrow(w_1), disease_count = sum(w_1$disease == "1") )
The result looks something like this:
gender status min_height max_height min_weight max_weight disease_rate total_count disease_count
1 male immigrant 157.7242 173.7773 94.77408 117.1924 0.6 40 24
My Question: I am trying to extend this code to calculate the disease rate for all such combinations. Below, I tried to represent this as a tree diagram (in which my code corresponds to the "red box"):
(note: this is obviously just a small part of the tree - it is almost impossible to fully draw the whole tree)
Can someone please show me how I take my general code structure and create results r_1, r_2.... r_n for all possible combinations in this tree - and then append them into a single dataset? Is there a quick way to do this with function/loop based approach?
Thanks!
Sources: https://www.smartdraw.com/software/tree-diagram-maker.htm
答案1
得分: 3
这是一个使用dplyr的解决方案:
set.seed(123)
library(dplyr, warn.conflicts = FALSE)
Patient_ID <- 1:5000
gender <- c("Male", "Female")
gender <- sample(gender, 5000, replace = TRUE, prob = c(0.45, 0.55))
gender <- as.factor(gender)
status <- c("Immigrant", "Citizen")
status <- sample(status, 5000, replace = TRUE, prob = c(0.3, 0.7))
status <- as.factor(status)
height <- rnorm(5000, 150, 10)
weight <- rnorm(5000, 90, 10)
disease <- sample(c(1, 0), 5000, replace = TRUE)
my_data <- data.frame(Patient_ID, gender, status, height, weight, disease)
my_data %>%
group_by(gender, status, ntile(height, 4), ntile(weight, 4)) %>%
summarise(
min_height = min(height),
max_height = max(height),
min_weight = min(weight),
max_weight = max(weight),
disease_rate = mean(disease),
disease_count = sum(disease),
total_count = n(),
.groups = "keep"
) %>%
ungroup() %>%
mutate(
height_group = factor(`ntile(height, 4)`,
levels = c(1, 2, 3, 4), labels = c("0-25%", "25-50%", "50-75%", "75-100%")
),
weight_group = factor(`ntile(weight, 4)`,
levels = c(1, 2, 3, 4), labels = c("0-25%", "25-50%", "50-75%", "75-100%")
)
) %>%
select(-starts_with("ntile"))
#> # A tibble: 64 × 11
#> gender status min_height max_height min_weight max_weight disease_rate
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Female Citizen 123. 143. 59.9 83.5 0.496
#> 2 Female Citizen 127. 143. 83.5 90.0 0.5
#> 3 Female Citizen 116. 143. 90.3 97.0 0.496
#> 4 Female Citizen 119. 143. 97.1 121. 0.558
#> 5 Female Citizen 143. 150. 59.0 83.5 0.483
#> 6 Female Citizen 143. 150. 83.5 90.0 0.508
#> 7 Female Citizen 143. 150. 90.1 96.9 0.510
#> 8 Female Citizen 143. 150. 97.1 119. 0.407
#> 9 Female Citizen 150. 157. 60.1 83.5 0.5
#> 10 Female Citizen 150. 157. 83.5 90.0 0.488
#> # ℹ 54 more rows
#> # ℹ 4 more variables: disease_count <dbl>, total_count <int>,
#> # height_group <fct>, weight_group <fct>
创建于2023年7月10日,使用reprex v2.0.2
英文:
Here's a dplyr solution:
set.seed(123)
library(dplyr, warn.conflicts = FALSE)
Patient_ID <- 1:5000
gender <- c("Male", "Female")
gender <- sample(gender, 5000, replace = TRUE, prob = c(0.45, 0.55))
gender <- as.factor(gender)
status <- c("Immigrant", "Citizen")
status <- sample(status, 5000, replace = TRUE, prob = c(0.3, 0.7))
status <- as.factor(status)
height <- rnorm(5000, 150, 10)
weight <- rnorm(5000, 90, 10)
disease <- sample(c(1, 0), 5000, replace = TRUE)
my_data <- data.frame(Patient_ID, gender, status, height, weight, disease)
my_data %>%
group_by(gender, status, ntile(height, 4), ntile(weight, 4)) %>%
summarise(
min_height = min(height),
max_height = max(height),
min_weight = min(weight),
max_weight = max(weight),
disease_rate = mean(disease),
disease_count = sum(disease),
total_count = n(),
.groups = "keep"
) %>%
ungroup() %>%
mutate(
height_group = factor(`ntile(height, 4)`,
levels = c(1, 2, 3, 4), labels = c("0-25%", "25-50%", "50-75%", "75-100%")
),
weight_group = factor(`ntile(weight, 4)`,
levels = c(1, 2, 3, 4), labels = c("0-25%", "25-50%", "50-75%", "75-100%")
)
) %>%
select(-starts_with("ntile"))
#> # A tibble: 64 × 11
#> gender status min_height max_height min_weight max_weight disease_rate
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Female Citizen 123. 143. 59.9 83.5 0.496
#> 2 Female Citizen 127. 143. 83.5 90.0 0.5
#> 3 Female Citizen 116. 143. 90.3 97.0 0.496
#> 4 Female Citizen 119. 143. 97.1 121. 0.558
#> 5 Female Citizen 143. 150. 59.0 83.5 0.483
#> 6 Female Citizen 143. 150. 83.5 90.0 0.508
#> 7 Female Citizen 143. 150. 90.1 96.9 0.510
#> 8 Female Citizen 143. 150. 97.1 119. 0.407
#> 9 Female Citizen 150. 157. 60.1 83.5 0.5
#> 10 Female Citizen 150. 157. 83.5 90.0 0.488
#> # ℹ 54 more rows
#> # ℹ 4 more variables: disease_count <dbl>, total_count <int>,
#> # height_group <fct>, weight_group <fct>
<sup>Created on 2023-07-10 with reprex v2.0.2</sup>
答案2
得分: 2
以下是您要翻译的内容:
This can be solved by first binning the heights and weights and then using table
and aggregate
.
# use 'cut' to bin height and weight, set the
# values to the quantiles' intervals
qnt <- quantile(my_data$height)
lbls <- paste(names(qnt)[-5], names(qnt)[-1], sep = "-")
my_data$height_quant <- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
my_data$height_quant <- paste(my_data$height_quant, "Tallest")
# only compute the quantile's cut points, the labels
# are built on the same lbls variable defined above
qnt <- quantile(my_data$weight)
my_data$weight_quant <- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
my_data$weight_quant <- paste(my_data$weight_quant, "Heaviest")
cols <- c("gender", "status", "height_quant", "weight_quant", "disease")
tbl <- table(my_data[cols])
# compute proprotions table, don't show
# proportions(tbl)
ftbl <- ftable(tbl)
# compute proprotions table, don't show
# proportions(ftbl)
# coerce the tables to data.frames
df_tbl <- as.data.frame(tbl)
df_tbl_prop <- as.data.frame(proportions(tbl))
df_ftbl <- as.data.frame(ftbl)
df_ftbl_prop <- as.data.frame(proportions(ftbl))
# examples of results data.frames
# total counts
head(df_tbl)
#> gender status height_quant weight_quant disease Freq
#> 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0 69
#> 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0 61
#> 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0 22
#> 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0 26
#> 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0 60
#> 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0 44
# proportions of disease on total counts
head(df_ftbl_prop)
#> gender status height_quant weight_quant disease Freq
#> 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0 0.0138
#> 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0 0.0122
#> 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0 0.0044
#> 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0 0.0052
#> 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0 0.0120
#> 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0 0.0088
# now compute proportions of disease per gender,
# status and height and weight quantiles
agg <- aggregate(Freq ~ gender + status + height_quant + weight_quant, df_tbl, \(x) x/sum(x))
colnames(agg[[ncol(agg)]]) <- paste0("disease.", 0:1)
agg <- cbind(agg[-ncol(agg)], agg[[ncol(agg)]])
head(agg)
#> gender status height_quant weight_quant disease.0 disease.1
#> 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0.5036496 0.4963504
#> 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0.5495495 0.4504505
#> 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0.4782609 0.5217391
#> 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0.6842105 0.3157895
#> 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0.5172414 0.4827586
#> 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0.4313725 0.5686275
<sup>Created on 2023-07-10 with reprex v2.0.2</sup>
Edit
To include min and max of height and weight turned out to be more complicated than expected. Here is a full example, duplicating most of the code above.
# use 'cut' to bin height and weight, set the
# values to the quantiles' intervals
qnt <- quantile(my_data$height)
lbls <- paste(names(qnt)[-5], names(qnt)[-1], sep = "-")
my_data$height_quant <- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
my_data$height_quant <- paste(my_data$height_quant, "Tallest")
tmp <- aggregate(height ~ height_quant, my_data, min)
my_data <- merge(my_data, tmp, by = "height_quant", suffixes = c("", "_min"))
tmp <- aggregate(height ~ height_quant, my_data, max)
my_data <- merge(my_data, tmp, by = "height_quant", suffixes = c("", "_max"))
# only compute the quantile's cut points, the labels
# are built on the same lbls variable defined above
qnt <- quantile(my_data$weight)
my_data$weight_quant <- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
my_data$weight_quant <- paste(my_data$weight_quant, "Heaviest")
tmp <- aggregate(weight ~ weight_quant, my_data, min)
my_data <- merge(my_data, tmp, by = "weight_quant", suffixes = c("", "_min"))
tmp <- aggregate(weight ~ weight_quant, my_data, max)
my_data <- merge(my_data, tmp, by = "weight_quant", suffixes = c("", "_max"))
rm(tmp)
cols <- c("gender", "status",
"height_quant", "height_min", "height_max",
"weight_quant", "weight_min", "weight_max",
"disease")
tbl <- table(my_data[cols])
# compute proprotions table, don't
<details>
<summary>英文:</summary>
This can be solved by first binning the heights and weights and then using `table` and `aggregate`.
``` r
# use 'cut' to bin height and weight, set the
# values to the quantiles' intervals
qnt <- quantile(my_data$height)
lbls <- paste(names(qnt)[-5], names(qnt)[-1], sep = "-")
my_data$height_quant <- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
my_data$height_quant <- paste(my_data$height_quant, "Tallest")
# only compute the quantile's cut points, the labels
# are built on the same lbls variable defined above
qnt <- quantile(my_data$weight)
my_data$weight_quant <- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
my_data$weight_quant <- paste(my_data$weight_quant, "Heaviest")
cols <- c("gender", "status", "height_quant", "weight_quant", "disease")
tbl <- table(my_data[cols])
# compute proprotions table, don't show
# proportions(tbl)
ftbl <- ftable(tbl)
# compute proprotions table, don't show
# proportions(ftbl)
# coerce the tables to data.frames
df_tbl <- as.data.frame(tbl)
df_tbl_prop <- as.data.frame(proportions(tbl))
df_ftbl <- as.data.frame(ftbl)
df_ftbl_prop <- as.data.frame(proportions(ftbl))
# examples of results data.frames
# total counts
head(df_tbl)
#> gender status height_quant weight_quant disease Freq
#> 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0 69
#> 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0 61
#> 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0 22
#> 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0 26
#> 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0 60
#> 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0 44
# proportions of disease on total counts
head(df_ftbl_prop)
#> gender status height_quant weight_quant disease Freq
#> 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0 0.0138
#> 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0 0.0122
#> 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0 0.0044
#> 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0 0.0052
#> 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0 0.0120
#> 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0 0.0088
# now compute proportions of disease per gender,
# status and height and weight quantiles
agg <- aggregate(Freq ~ gender + status + height_quant + weight_quant, df_tbl, \(x) x/sum(x))
colnames(agg[[ncol(agg)]]) <- paste0("disease.", 0:1)
agg <- cbind(agg[-ncol(agg)], agg[[ncol(agg)]])
head(agg)
#> gender status height_quant weight_quant disease.0 disease.1
#> 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0.5036496 0.4963504
#> 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0.5495495 0.4504505
#> 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0.4782609 0.5217391
#> 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0.6842105 0.3157895
#> 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0.5172414 0.4827586
#> 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0.4313725 0.5686275
<sup>Created on 2023-07-10 with reprex v2.0.2</sup>
Edit
To include min and max of height and weight turned out to be more complicated than expected. Here is a full example, duplicating most of the code above. But getting the job done.
# use 'cut' to bin height and weight, set the
# values to the quantiles' intervals
qnt <- quantile(my_data$height)
lbls <- paste(names(qnt)[-5], names(qnt)[-1], sep = "-")
my_data$height_quant <- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
my_data$height_quant <- paste(my_data$height_quant, "Tallest")
tmp <- aggregate(height ~ height_quant, my_data, min)
my_data <- merge(my_data, tmp, by = "height_quant", suffixes = c("", "_min"))
tmp <- aggregate(height ~ height_quant, my_data, max)
my_data <- merge(my_data, tmp, by = "height_quant", suffixes = c("", "_max"))
# only compute the quantile's cut points, the labels
# are built on the same lbls variable defined above
qnt <- quantile(my_data$weight)
my_data$weight_quant <- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
my_data$weight_quant <- paste(my_data$weight_quant, "Heaviest")
tmp <- aggregate(weight ~ weight_quant, my_data, min)
my_data <- merge(my_data, tmp, by = "weight_quant", suffixes = c("", "_min"))
tmp <- aggregate(weight ~ weight_quant, my_data, max)
my_data <- merge(my_data, tmp, by = "weight_quant", suffixes = c("", "_max"))
rm(tmp)
cols <- c("gender", "status",
"height_quant", "height_min", "height_max",
"weight_quant", "weight_min", "weight_max",
"disease")
tbl <- table(my_data[cols])
# compute proprotions table, don't show
# proportions(tbl)
ftbl <- ftable(tbl)
# compute proprotions table, don't show
# proportions(ftbl)
# coerce the tables to data.frames
df_tbl <- as.data.frame(tbl)
df_tbl_prop <- as.data.frame(proportions(tbl))
df_ftbl <- as.data.frame(ftbl)
df_ftbl_prop <- as.data.frame(proportions(ftbl))
# examples of results data.frames
# total counts
head(df_tbl)
#> gender status height_quant height_min height_max weight_quant weight_min weight_max disease Freq
#> 1 Female Citizen 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 69
#> 2 Male Citizen 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 61
#> 3 Female Immigrant 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 22
#> 4 Male Immigrant 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 26
#> 5 Female Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0
#> 6 Male Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0
# proportions of disease on total counts
head(df_ftbl_prop)
#> gender status height_quant height_min height_max weight_quant weight_min weight_max disease Freq
#> 1 Female Citizen 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0138
#> 2 Male Citizen 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0122
#> 3 Female Immigrant 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0044
#> 4 Male Immigrant 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0052
#> 5 Female Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0000
#> 6 Male Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0000
# now compute proportions of disease per gender,
# status and height and weight quantiles
df_tbl <- df_tbl[df_tbl$Freq != 0, ]
agg <- aggregate(Freq ~ gender + status +
height_quant + height_min + height_max +
height_quant + weight_min + weight_max,
df_tbl, \(x) x/sum(x))
colnames(agg[[ncol(agg)]]) <- paste0("disease.", 0:1)
agg <- cbind(agg[-ncol(agg)], agg[[ncol(agg)]])
agg[4:7] <- lapply(agg[4:7], \(x) as.numeric(as.character(x)))
head(agg)
#> gender status height_quant height_min height_max weight_min weight_max disease.0 disease.1
#> 1 Female Citizen 0%-25% Tallest 111.5468 143.1650 56.53098 83.49743 0.5036496 0.4963504
#> 2 Male Citizen 0%-25% Tallest 111.5468 143.1650 56.53098 83.49743 0.5495495 0.4504505
#> 3 Female Immigrant 0%-25% Tallest 111.5468 143.1650 56.53098 83.49743 0.4782609 0.5217391
#> 4 Male Immigrant 0%-25% Tallest 111.5468 143.1650 56.53098 83.49743 0.6842105 0.3157895
#> 5 Female Citizen 25%-50% Tallest 143.1659 149.8666 56.53098 83.49743 0.5172414 0.4827586
#> 6 Male Citizen 25%-50% Tallest 143.1659 149.8666 56.53098 83.49743 0.4313725 0.5686275
<sup>Created on 2023-07-10 with reprex v2.0.2</sup>
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论