R: 为嵌套数据集编写函数

huangapple go评论107阅读模式
英文:

R: Writing Functions for Nested Datasets

问题

我正在使用R编程语言进行工作。

我有以下数据集:

  1. set.seed(123)
  2. library(dplyr)
  3. Patient_ID = 1:5000
  4. gender <- c("Male","Female")
  5. gender <- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
  6. gender <- as.factor(gender)
  7. status <- c("Immigrant","Citizen")
  8. status <- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
  9. status <- as.factor(status )
  10. height = rnorm(5000, 150, 10)
  11. weight = rnorm(5000, 90, 10)
  12. disease = sample(c(1, 0), 5000, replace = TRUE)
  13. my_data = data.frame(Patient_ID, gender, status, height, weight,disease)

下面这段代码先选出所有男性移民,然后取其中身高最高的 25%,再从这 25% 最高的男性移民中取体重最重的 25%,并计算其中有多少人患有疾病:

  1. part_1 = my_data[my_data$gender == "Male" & my_data$status == "Immigrant",]
  2. part_1 = part_1 %>% arrange(desc(height))
  3. limits = as.integer(seq(1, nrow(part_1), by = 0.25*nrow(part_1)))
  4. limits = c(limits, nrow(part_1))
  5. create_h <- function(part_1) {
  6. limits <- as.integer(seq(1, nrow(part_1), by = 0.25 * nrow(part_1)))
  7. limits <- c(limits, nrow(part_1))
  8. h_list <- list()
  9. for (i in 1:(length(limits) - 1)) {
  10. h_list[[i]] <- part_1[limits[i]:(limits[i + 1] - 1), ]
  11. }
  12. return(h_list)
  13. }
  14. h_list <- create_h(part_1)
  15. for (i in seq_along(h_list)) {
  16. assign(paste0("h_", i), h_list[[i]])
  17. }
  18. part_2 = h_1 %>% arrange(desc(weight))
  19. limits = as.integer(seq(1, nrow(part_2), by = 0.25*nrow(part_2)))
  20. limits = c(limits, nrow(part_2))
  21. create_w <- function(part_2) {
  22. limits <- as.integer(seq(1, nrow(part_2), by = 0.25 * nrow(part_2)))
  23. limits <- c(limits, nrow(part_2))
  24. w_list <- list()
  25. for (i in 1:(length(limits) - 1)) {
  26. w_list[[i]] <- part_1[limits[i]:(limits[i + 1] - 1), ]
  27. }
  28. return(w_list)
  29. }
  30. w_list <- create_h(part_2)
  31. for (i in seq_along(w_list)) {
  32. assign(paste0("w_", i), w_list[[i]])
  33. }
  34. r1 = data.frame(gender = "male", status = "immigrant", min_height = min(w_1$height), max_height = max(w_2$height), min_weight = min(w_1$weight), max_weight = max(w_1$weight), disease_rate = mean(w_1$disease), total_count = nrow(w_1), disease_count = sum(w_1$disease == "1") )

结果看起来类似于这样:

  1. gender status min_height max_height min_weight max_weight disease_rate total_count disease_count
  2. 1 male immigrant 157.7242 173.7773 94.77408 117.1924 0.6 40 24

我的问题是:我试图扩展此代码,以计算所有这些组合的疾病率。下面,我试图将这些组合表示为树状图(其中我的代码对应于“红框”):

R: 为嵌套数据集编写函数

(注意:这显然只是树的一小部分 - 完全绘制整个树几乎是不可能的)

有人可以向我展示如何使用我的一般代码结构为所有可能的组合创建结果r_1、r_2.... r_n,然后将它们附加到单个数据集中吗?是否有一种使用函数/循环的快速方法来实现这一点?

谢谢!

来源:https://www.smartdraw.com/software/tree-diagram-maker.htm

英文:

I am working with the R programming language.

I have the following dataset:

  1. set.seed(123)
  2. library(dplyr)
  3. Patient_ID = 1:5000
  4. gender &lt;- c(&quot;Male&quot;,&quot;Female&quot;)
  5. gender &lt;- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
  6. gender &lt;- as.factor(gender)
  7. status &lt;- c(&quot;Immigrant&quot;,&quot;Citizen&quot;)
  8. status &lt;- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
  9. status &lt;- as.factor(status )
  10. height = rnorm(5000, 150, 10)
  11. weight = rnorm(5000, 90, 10)
  12. disease = sample(c(1, 0), 5000, replace = TRUE)
  13. my_data = data.frame(Patient_ID, gender, status, height, weight,disease)

Below is code in which I take all male immigrants, then take the 25% tallest male immigrants; of these 25% tallest male immigrants I take the 25% heaviest, and calculate how many of them have the disease:

  1. part_1 = my_data[my_data$gender == &quot;Male&quot; &amp; my_data$status == &quot;Immigrant&quot;,]
  2. part_1 = part_1 %&gt;% arrange(desc(height))
  3. limits = as.integer(seq(1, nrow(part_1), by = 0.25*nrow(part_1)))
  4. limits = c(limits, nrow(part_1))
  5. create_h &lt;- function(part_1) {
  6. limits &lt;- as.integer(seq(1, nrow(part_1), by = 0.25 * nrow(part_1)))
  7. limits &lt;- c(limits, nrow(part_1))
  8. h_list &lt;- list()
  9. for (i in 1:(length(limits) - 1)) {
  10. h_list[[i]] &lt;- part_1[limits[i]:(limits[i + 1] - 1), ]
  11. }
  12. return(h_list)
  13. }
  14. h_list &lt;- create_h(part_1)
  15. for (i in seq_along(h_list)) {
  16. assign(paste0(&quot;h_&quot;, i), h_list[[i]])
  17. }
  18. ###########################
  19. part_2 = h_1 %&gt;% arrange(desc(weight))
  20. limits = as.integer(seq(1, nrow(part_2), by = 0.25*nrow(part_2)))
  21. limits = c(limits, nrow(part_2))
  22. create_w &lt;- function(part_2) {
  23. limits &lt;- as.integer(seq(1, nrow(part_2), by = 0.25 * nrow(part_2)))
  24. limits &lt;- c(limits, nrow(part_2))
  25. w_list &lt;- list()
  26. for (i in 1:(length(limits) - 1)) {
  27. w_list[[i]] &lt;- part_1[limits[i]:(limits[i + 1] - 1), ]
  28. }
  29. return(w_list)
  30. }
  31. w_list &lt;- create_h(part_2)
  32. for (i in seq_along(w_list)) {
  33. assign(paste0(&quot;w_&quot;, i), w_list[[i]])
  34. }
  35. ##############################
  36. r1 = data.frame(gender = &quot;male&quot;, status = &quot;immigrant&quot;, min_height = min(w_1$height), max_height = max(w_2$height), min_weight = min(w_1$weight), max_weight = max(w_1$weight), disease_rate = mean(w_1$disease), total_count = nrow(w_1), disease_count = sum(w_1$disease == &quot;1&quot;) )

The result looks something like this:

  1. gender status min_height max_height min_weight max_weight disease_rate total_count disease_count
  2. 1 male immigrant 157.7242 173.7773 94.77408 117.1924 0.6 40 24

My Question: I am trying to extend this code to calculate the disease rate for all such combinations. Below, I tried to represent this as a tree diagram (in which my code corresponds to the "red box"):

R: 为嵌套数据集编写函数

(note: this is obviously just a small part of the tree - it is almost impossible to fully draw the whole tree)

Can someone please show me how I can take my general code structure and create results r_1, r_2, ..., r_n for all possible combinations in this tree, and then append them into a single dataset? Is there a quick way to do this with a function/loop-based approach?

Thanks!

Sources: https://www.smartdraw.com/software/tree-diagram-maker.htm

答案1

得分: 3

这是一个使用dplyr的解决方案:

  1. set.seed(123)
  2. library(dplyr, warn.conflicts = FALSE)
  3. Patient_ID <- 1:5000
  4. gender <- c("Male", "Female")
  5. gender <- sample(gender, 5000, replace = TRUE, prob = c(0.45, 0.55))
  6. gender <- as.factor(gender)
  7. status <- c("Immigrant", "Citizen")
  8. status <- sample(status, 5000, replace = TRUE, prob = c(0.3, 0.7))
  9. status <- as.factor(status)
  10. height <- rnorm(5000, 150, 10)
  11. weight <- rnorm(5000, 90, 10)
  12. disease <- sample(c(1, 0), 5000, replace = TRUE)
  13. my_data <- data.frame(Patient_ID, gender, status, height, weight, disease)
  14. my_data %>%
  15. group_by(gender, status, ntile(height, 4), ntile(weight, 4)) %>%
  16. summarise(
  17. min_height = min(height),
  18. max_height = max(height),
  19. min_weight = min(weight),
  20. max_weight = max(weight),
  21. disease_rate = mean(disease),
  22. disease_count = sum(disease),
  23. total_count = n(),
  24. .groups = "keep"
  25. ) %>%
  26. ungroup() %>%
  27. mutate(
  28. height_group = factor(`ntile(height, 4)`,
  29. levels = c(1, 2, 3, 4), labels = c("0-25%", "25-50%", "50-75%", "75-100%")
  30. ),
  31. weight_group = factor(`ntile(weight, 4)`,
  32. levels = c(1, 2, 3, 4), labels = c("0-25%", "25-50%", "50-75%", "75-100%")
  33. )
  34. ) %>%
  35. select(-starts_with("ntile"))
  36. #> # A tibble: 64 × 11
  37. #> gender status min_height max_height min_weight max_weight disease_rate
  38. #> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
  39. #> 1 Female Citizen 123. 143. 59.9 83.5 0.496
  40. #> 2 Female Citizen 127. 143. 83.5 90.0 0.5
  41. #> 3 Female Citizen 116. 143. 90.3 97.0 0.496
  42. #> 4 Female Citizen 119. 143. 97.1 121. 0.558
  43. #> 5 Female Citizen 143. 150. 59.0 83.5 0.483
  44. #> 6 Female Citizen 143. 150. 83.5 90.0 0.508
  45. #> 7 Female Citizen 143. 150. 90.1 96.9 0.510
  46. #> 8 Female Citizen 143. 150. 97.1 119. 0.407
  47. #> 9 Female Citizen 150. 157. 60.1 83.5 0.5
  48. #> 10 Female Citizen 150. 157. 83.5 90.0 0.488
  49. #> # ℹ 54 more rows
  50. #> # ℹ 4 more variables: disease_count <dbl>, total_count <int>,
  51. #> # height_group <fct>, weight_group <fct>

创建于2023年7月10日,使用reprex v2.0.2

英文:

Here's a dplyr solution:

  1. set.seed(123)
  2. library(dplyr, warn.conflicts = FALSE)
  3. Patient_ID &lt;- 1:5000
  4. gender &lt;- c(&quot;Male&quot;, &quot;Female&quot;)
  5. gender &lt;- sample(gender, 5000, replace = TRUE, prob = c(0.45, 0.55))
  6. gender &lt;- as.factor(gender)
  7. status &lt;- c(&quot;Immigrant&quot;, &quot;Citizen&quot;)
  8. status &lt;- sample(status, 5000, replace = TRUE, prob = c(0.3, 0.7))
  9. status &lt;- as.factor(status)
  10. height &lt;- rnorm(5000, 150, 10)
  11. weight &lt;- rnorm(5000, 90, 10)
  12. disease &lt;- sample(c(1, 0), 5000, replace = TRUE)
  13. my_data &lt;- data.frame(Patient_ID, gender, status, height, weight, disease)
  14. my_data %&gt;%
  15. group_by(gender, status, ntile(height, 4), ntile(weight, 4)) %&gt;%
  16. summarise(
  17. min_height = min(height),
  18. max_height = max(height),
  19. min_weight = min(weight),
  20. max_weight = max(weight),
  21. disease_rate = mean(disease),
  22. disease_count = sum(disease),
  23. total_count = n(),
  24. .groups = &quot;keep&quot;
  25. ) %&gt;%
  26. ungroup() %&gt;%
  27. mutate(
  28. height_group = factor(`ntile(height, 4)`,
  29. levels = c(1, 2, 3, 4), labels = c(&quot;0-25%&quot;, &quot;25-50%&quot;, &quot;50-75%&quot;, &quot;75-100%&quot;)
  30. ),
  31. weight_group = factor(`ntile(weight, 4)`,
  32. levels = c(1, 2, 3, 4), labels = c(&quot;0-25%&quot;, &quot;25-50%&quot;, &quot;50-75%&quot;, &quot;75-100%&quot;)
  33. )
  34. ) %&gt;%
  35. select(-starts_with(&quot;ntile&quot;))
  36. #> # A tibble: 64 × 11
  37. #&gt; gender status min_height max_height min_weight max_weight disease_rate
  38. #&gt; &lt;fct&gt; &lt;fct&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt;
  39. #&gt; 1 Female Citizen 123. 143. 59.9 83.5 0.496
  40. #&gt; 2 Female Citizen 127. 143. 83.5 90.0 0.5
  41. #&gt; 3 Female Citizen 116. 143. 90.3 97.0 0.496
  42. #&gt; 4 Female Citizen 119. 143. 97.1 121. 0.558
  43. #&gt; 5 Female Citizen 143. 150. 59.0 83.5 0.483
  44. #&gt; 6 Female Citizen 143. 150. 83.5 90.0 0.508
  45. #&gt; 7 Female Citizen 143. 150. 90.1 96.9 0.510
  46. #&gt; 8 Female Citizen 143. 150. 97.1 119. 0.407
  47. #&gt; 9 Female Citizen 150. 157. 60.1 83.5 0.5
  48. #&gt; 10 Female Citizen 150. 157. 83.5 90.0 0.488
  49. #&gt; # ℹ 54 more rows
  50. #&gt; # ℹ 4 more variables: disease_count &lt;dbl&gt;, total_count &lt;int&gt;,
  51. #&gt; # height_group &lt;fct&gt;, weight_group &lt;fct&gt;

<sup>Created on 2023-07-10 with reprex v2.0.2</sup>

答案2

得分: 2

可以先对身高和体重进行分箱,然后使用 `table` 和 `aggregate` 来解决这个问题。

  1. # use 'cut' to bin height and weight, set the
  2. # values to the quantiles' intervals
  3. qnt <- quantile(my_data$height)
  4. lbls <- paste(names(qnt)[-5], names(qnt)[-1], sep = "-")
  5. my_data$height_quant <- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
  6. my_data$height_quant <- paste(my_data$height_quant, "Tallest")
  7. # only compute the quantile's cut points, the labels
  8. # are built on the same lbls variable defined above
  9. qnt <- quantile(my_data$weight)
  10. my_data$weight_quant <- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
  11. my_data$weight_quant <- paste(my_data$weight_quant, "Heaviest")
  12. cols <- c("gender", "status", "height_quant", "weight_quant", "disease")
  13. tbl <- table(my_data[cols])
  14. # compute proportions table, don't show
  15. # proportions(tbl)
  16. ftbl <- ftable(tbl)
  17. # compute proportions table, don't show
  18. # proportions(ftbl)
  19. # coerce the tables to data.frames
  20. df_tbl <- as.data.frame(tbl)
  21. df_tbl_prop <- as.data.frame(proportions(tbl))
  22. df_ftbl <- as.data.frame(ftbl)
  23. df_ftbl_prop <- as.data.frame(proportions(ftbl))
  24. # examples of results data.frames
  25. # total counts
  26. head(df_tbl)
  27. #> gender status height_quant weight_quant disease Freq
  28. #> 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0 69
  29. #> 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0 61
  30. #> 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0 22
  31. #> 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0 26
  32. #> 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0 60
  33. #> 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0 44
  34. # proportions of disease on total counts
  35. head(df_ftbl_prop)
  36. #> gender status height_quant weight_quant disease Freq
  37. #> 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0 0.0138
  38. #> 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0 0.0122
  39. #> 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0 0.0044
  40. #> 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0 0.0052
  41. #> 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0 0.0120
  42. #> 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0 0.0088
  43. # now compute proportions of disease per gender,
  44. # status and height and weight quantiles
  45. agg <- aggregate(Freq ~ gender + status + height_quant + weight_quant, df_tbl, \(x) x/sum(x))
  46. colnames(agg[[ncol(agg)]]) <- paste0("disease.", 0:1)
  47. agg <- cbind(agg[-ncol(agg)], agg[[ncol(agg)]])
  48. head(agg)
  49. #> gender status height_quant weight_quant disease.0 disease.1
  50. #> 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0.5036496 0.4963504
  51. #> 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0.5495495 0.4504505
  52. #> 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0.4782609 0.5217391
  53. #> 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0.6842105 0.3157895
  54. #> 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0.5172414 0.4827586
  55. #> 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0.4313725 0.5686275

<sup>Created on 2023-07-10 with reprex v2.0.2</sup>

编辑

要把身高和体重的最小值与最大值也包含进来,结果比预期的要复杂。下面是一个完整的示例,其中重复了上面的大部分代码。

  1. # use 'cut' to bin height and weight, set the
  2. # values to the quantiles' intervals
  3. qnt <- quantile(my_data$height)
  4. lbls <- paste(names(qnt)[-5], names(qnt)[-1], sep = "-")
  5. my_data$height_quant <- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
  6. my_data$height_quant <- paste(my_data$height_quant, "Tallest")
  7. tmp <- aggregate(height ~ height_quant, my_data, min)
  8. my_data <- merge(my_data, tmp, by = "height_quant", suffixes = c("", "_min"))
  9. tmp <- aggregate(height ~ height_quant, my_data, max)
  10. my_data <- merge(my_data, tmp, by = "height_quant", suffixes = c("", "_max"))
  11. # only compute the quantile's cut points, the labels
  12. # are built on the same lbls variable defined above
  13. qnt <- quantile(my_data$weight)
  14. my_data$weight_quant <- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
  15. my_data$weight_quant <- paste(my_data$weight_quant, "Heaviest")
  16. tmp <- aggregate(weight ~ weight_quant, my_data, min)
  17. my_data <- merge(my_data, tmp, by = "weight_quant", suffixes = c("", "_min"))
  18. tmp <- aggregate(weight ~ weight_quant, my_data, max)
  19. my_data <- merge(my_data, tmp, by = "weight_quant", suffixes = c("", "_max"))
  20. rm(tmp)
  21. cols <- c("gender", "status",
  22. "height_quant", "height_min", "height_max",
  23. "weight_quant", "weight_min", "weight_max",
  24. "disease")
  25. tbl <- table(my_data[cols])
  26. # compute proportions table, don't show
英文:

This can be solved by first binning the heights and weights and then using `table` and `aggregate`.

  31. # use &#39;cut&#39; to bin height and weight, set the
  32. # values to the quantiles&#39; intervals
  33. qnt &lt;- quantile(my_data$height)
  34. lbls &lt;- paste(names(qnt)[-5], names(qnt)[-1], sep = &quot;-&quot;)
  35. my_data$height_quant &lt;- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
  36. my_data$height_quant &lt;- paste(my_data$height_quant, &quot;Tallest&quot;)
  37. # only compute the quantile&#39;s cut points, the labels
  38. # are built on the same lbls variable defined above
  39. qnt &lt;- quantile(my_data$weight)
  40. my_data$weight_quant &lt;- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
  41. my_data$weight_quant &lt;- paste(my_data$weight_quant, &quot;Heaviest&quot;)
  42. cols &lt;- c(&quot;gender&quot;, &quot;status&quot;, &quot;height_quant&quot;, &quot;weight_quant&quot;, &quot;disease&quot;)
  43. tbl &lt;- table(my_data[cols])
  44. # compute proportions table, don't show
  45. # proportions(tbl)
  46. ftbl &lt;- ftable(tbl)
  47. # compute proportions table, don't show
  48. # proportions(ftbl)
  49. # coerce the tables to data.frames
  50. df_tbl &lt;- as.data.frame(tbl)
  51. df_tbl_prop &lt;- as.data.frame(proportions(tbl))
  52. df_ftbl &lt;- as.data.frame(ftbl)
  53. df_ftbl_prop &lt;- as.data.frame(proportions(ftbl))
  54. # examples of results data.frames
  55. # total counts
  56. head(df_tbl)
  57. #&gt; gender status height_quant weight_quant disease Freq
  58. #&gt; 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0 69
  59. #&gt; 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0 61
  60. #&gt; 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0 22
  61. #&gt; 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0 26
  62. #&gt; 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0 60
  63. #&gt; 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0 44
  64. # proportions of disease on total counts
  65. head(df_ftbl_prop)
  66. #&gt; gender status height_quant weight_quant disease Freq
  67. #&gt; 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0 0.0138
  68. #&gt; 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0 0.0122
  69. #&gt; 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0 0.0044
  70. #&gt; 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0 0.0052
  71. #&gt; 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0 0.0120
  72. #&gt; 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0 0.0088
  73. # now compute proportions of disease per gender,
  74. # status and height and weight quantiles
  75. agg &lt;- aggregate(Freq ~ gender + status + height_quant + weight_quant, df_tbl, \(x) x/sum(x))
  76. colnames(agg[[ncol(agg)]]) &lt;- paste0(&quot;disease.&quot;, 0:1)
  77. agg &lt;- cbind(agg[-ncol(agg)], agg[[ncol(agg)]])
  78. head(agg)
  79. #&gt; gender status height_quant weight_quant disease.0 disease.1
  80. #&gt; 1 Female Citizen 0%-25% Tallest 0%-25% Heaviest 0.5036496 0.4963504
  81. #&gt; 2 Male Citizen 0%-25% Tallest 0%-25% Heaviest 0.5495495 0.4504505
  82. #&gt; 3 Female Immigrant 0%-25% Tallest 0%-25% Heaviest 0.4782609 0.5217391
  83. #&gt; 4 Male Immigrant 0%-25% Tallest 0%-25% Heaviest 0.6842105 0.3157895
  84. #&gt; 5 Female Citizen 25%-50% Tallest 0%-25% Heaviest 0.5172414 0.4827586
  85. #&gt; 6 Male Citizen 25%-50% Tallest 0%-25% Heaviest 0.4313725 0.5686275

<sup>Created on 2023-07-10 with reprex v2.0.2</sup>


Edit

Including the min and max of height and weight turned out to be more complicated than expected. Here is a full example; it duplicates most of the code above, but it gets the job done.

  1. # use &#39;cut&#39; to bin height and weight, set the
  2. # values to the quantiles&#39; intervals
  3. qnt &lt;- quantile(my_data$height)
  4. lbls &lt;- paste(names(qnt)[-5], names(qnt)[-1], sep = &quot;-&quot;)
  5. my_data$height_quant &lt;- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
  6. my_data$height_quant &lt;- paste(my_data$height_quant, &quot;Tallest&quot;)
  7. tmp &lt;- aggregate(height ~ height_quant, my_data, min)
  8. my_data &lt;- merge(my_data, tmp, by = &quot;height_quant&quot;, suffixes = c(&quot;&quot;, &quot;_min&quot;))
  9. tmp &lt;- aggregate(height ~ height_quant, my_data, max)
  10. my_data &lt;- merge(my_data, tmp, by = &quot;height_quant&quot;, suffixes = c(&quot;&quot;, &quot;_max&quot;))
  11. # only compute the quantile&#39;s cut points, the labels
  12. # are built on the same lbls variable defined above
  13. qnt &lt;- quantile(my_data$weight)
  14. my_data$weight_quant &lt;- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
  15. my_data$weight_quant &lt;- paste(my_data$weight_quant, &quot;Heaviest&quot;)
  16. tmp &lt;- aggregate(weight ~ weight_quant, my_data, min)
  17. my_data &lt;- merge(my_data, tmp, by = &quot;weight_quant&quot;, suffixes = c(&quot;&quot;, &quot;_min&quot;))
  18. tmp &lt;- aggregate(weight ~ weight_quant, my_data, max)
  19. my_data &lt;- merge(my_data, tmp, by = &quot;weight_quant&quot;, suffixes = c(&quot;&quot;, &quot;_max&quot;))
  20. rm(tmp)
  21. cols &lt;- c(&quot;gender&quot;, &quot;status&quot;,
  22. &quot;height_quant&quot;, &quot;height_min&quot;, &quot;height_max&quot;,
  23. &quot;weight_quant&quot;, &quot;weight_min&quot;, &quot;weight_max&quot;,
  24. &quot;disease&quot;)
  25. tbl &lt;- table(my_data[cols])
  26. # compute proportions table, don't show
  27. # proportions(tbl)
  28. ftbl &lt;- ftable(tbl)
  29. # compute proportions table, don't show
  30. # proportions(ftbl)
  31. # coerce the tables to data.frames
  32. df_tbl &lt;- as.data.frame(tbl)
  33. df_tbl_prop &lt;- as.data.frame(proportions(tbl))
  34. df_ftbl &lt;- as.data.frame(ftbl)
  35. df_ftbl_prop &lt;- as.data.frame(proportions(ftbl))
  36. # examples of results data.frames
  37. # total counts
  38. head(df_tbl)
  39. #&gt; gender status height_quant height_min height_max weight_quant weight_min weight_max disease Freq
  40. #&gt; 1 Female Citizen 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 69
  41. #&gt; 2 Male Citizen 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 61
  42. #&gt; 3 Female Immigrant 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 22
  43. #&gt; 4 Male Immigrant 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 26
  44. #&gt; 5 Female Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0
  45. #&gt; 6 Male Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0
  46. # proportions of disease on total counts
  47. head(df_ftbl_prop)
  48. #&gt; gender status height_quant height_min height_max weight_quant weight_min weight_max disease Freq
  49. #&gt; 1 Female Citizen 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0138
  50. #&gt; 2 Male Citizen 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0122
  51. #&gt; 3 Female Immigrant 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0044
  52. #&gt; 4 Male Immigrant 0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0052
  53. #&gt; 5 Female Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0000
  54. #&gt; 6 Male Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399 0 0.0000
  55. # now compute proportions of disease per gender,
  56. # status and height and weight quantiles
  57. df_tbl &lt;- df_tbl[df_tbl$Freq != 0, ]
  58. agg &lt;- aggregate(Freq ~ gender + status +
  59. height_quant + height_min + height_max +
  60. height_quant + weight_min + weight_max,
  61. df_tbl, \(x) x/sum(x))
  62. colnames(agg[[ncol(agg)]]) &lt;- paste0(&quot;disease.&quot;, 0:1)
  63. agg &lt;- cbind(agg[-ncol(agg)], agg[[ncol(agg)]])
  64. agg[4:7] &lt;- lapply(agg[4:7], \(x) as.numeric(as.character(x)))
  65. head(agg)
  66. #&gt; gender status height_quant height_min height_max weight_min weight_max disease.0 disease.1
  67. #&gt; 1 Female Citizen 0%-25% Tallest 111.5468 143.1650 56.53098 83.49743 0.5036496 0.4963504
  68. #&gt; 2 Male Citizen 0%-25% Tallest 111.5468 143.1650 56.53098 83.49743 0.5495495 0.4504505
  69. #&gt; 3 Female Immigrant 0%-25% Tallest 111.5468 143.1650 56.53098 83.49743 0.4782609 0.5217391
  70. #&gt; 4 Male Immigrant 0%-25% Tallest 111.5468 143.1650 56.53098 83.49743 0.6842105 0.3157895
  71. #&gt; 5 Female Citizen 25%-50% Tallest 143.1659 149.8666 56.53098 83.49743 0.5172414 0.4827586
  72. #&gt; 6 Male Citizen 25%-50% Tallest 143.1659 149.8666 56.53098 83.49743 0.4313725 0.5686275

<sup>Created on 2023-07-10 with reprex v2.0.2</sup>

huangapple
  • 本文由 发表于 2023年7月10日 12:10:32
  • 转载请务必保留本文链接:https://go.coder-hub.com/76650638.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定