R: 为嵌套数据集编写函数

huangapple go评论59阅读模式
英文:

R: Writing Functions for Nested Datasets

问题

我正在使用R编程语言进行工作。

我有以下数据集:

set.seed(123)
library(dplyr)

Patient_ID = 1:5000
gender <- c("Male","Female")
gender <- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
gender <- as.factor(gender)

status <- c("Immigrant","Citizen")
status <- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
status  <- as.factor(status )

height = rnorm(5000, 150, 10)
weight = rnorm(5000, 90, 10)

disease = sample(c(1, 0), 5000, replace = TRUE)

my_data = data.frame(Patient_ID, gender, status, height, weight,disease)

以下是一个代码,我从中选择所有男性公民 - 然后选择最高的25%男性公民,然后从这25%最高的男性公民中选择最重的25%男性公民,并计算他们中有多少人患有疾病:

part_1 = my_data[my_data$gender == "Male" & my_data$status == "Immigrant",]
part_1 = part_1 %>% arrange(desc(height))

limits = as.integer(seq(1, nrow(part_1), by = 0.25*nrow(part_1)))
limits = c(limits, nrow(part_1))

create_h <- function(part_1) {
    limits <- as.integer(seq(1, nrow(part_1), by = 0.25 * nrow(part_1)))
    limits <- c(limits, nrow(part_1))
    h_list <- list()
    for (i in 1:(length(limits) - 1)) {
        h_list[[i]] <- part_1[limits[i]:(limits[i + 1] - 1), ]
    }
    return(h_list)
}

h_list <- create_h(part_1)

for (i in seq_along(h_list)) {
    assign(paste0("h_", i), h_list[[i]])
}

part_2 = h_1 %>% arrange(desc(weight))

limits = as.integer(seq(1, nrow(part_2), by = 0.25*nrow(part_2)))
limits = c(limits, nrow(part_2))

create_w <- function(part_2) {
    limits <- as.integer(seq(1, nrow(part_2), by = 0.25 * nrow(part_2)))
    limits <- c(limits, nrow(part_2))
    w_list <- list()
    for (i in 1:(length(limits) - 1)) {
        w_list[[i]] <- part_1[limits[i]:(limits[i + 1] - 1), ]
    }
    return(w_list)
}

w_list <- create_h(part_2)

for (i in seq_along(w_list)) {
    assign(paste0("w_", i), w_list[[i]])
}

r1 = data.frame(gender = "male", status = "immigrant", min_height = min(w_1$height), max_height = max(w_2$height), min_weight = min(w_1$weight), max_weight = max(w_1$weight), disease_rate = mean(w_1$disease), total_count = nrow(w_1), disease_count = sum(w_1$disease == "1") )

结果看起来类似于这样:

  gender    status min_height max_height min_weight max_weight disease_rate total_count disease_count
1   male immigrant   157.7242   173.7773   94.77408   117.1924          0.6          40            24

我的问题是:我试图扩展此代码,以计算所有这些组合的疾病率。下面,我试图将这些组合表示为树状图(其中我的代码对应于“红框”):

R: 为嵌套数据集编写函数

(注意:这显然只是树的一小部分 - 完全绘制整个树几乎是不可能的)

有人可以向我展示如何使用我的一般代码结构为所有可能的组合创建结果r_1、r_2.... r_n,然后将它们附加到单个数据集中吗?是否有一种使用函数/循环的快速方法来实现这一点?

谢谢!

来源:https://www.smartdraw.com/software/tree-diagram-maker.htm

英文:

I am working with the R programming language.

I have the following dataset:

set.seed(123)
library(dplyr)
Patient_ID = 1:5000
gender &lt;- c(&quot;Male&quot;,&quot;Female&quot;)
gender &lt;- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
gender &lt;- as.factor(gender)
status &lt;- c(&quot;Immigrant&quot;,&quot;Citizen&quot;)
status &lt;- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
status  &lt;- as.factor(status )
height = rnorm(5000, 150, 10)
weight = rnorm(5000, 90, 10)
disease = sample(c(1, 0), 5000, replace = TRUE)
my_data = data.frame(Patient_ID, gender, status, height, weight,disease)

Below is a code in which I take all male citizens - then take the 25% tallest male citizens, of these 25% tallest male citizens I take the 25% heaviest male citizens, and calculate how many of them have the disease:

part_1 = my_data[my_data$gender == &quot;Male&quot; &amp; my_data$status == &quot;Immigrant&quot;,]
part_1 = part_1 %&gt;% arrange(desc(height))
limits = as.integer(seq(1, nrow(part_1), by = 0.25*nrow(part_1)))
limits = c(limits, nrow(part_1))
create_h &lt;- function(part_1) {
limits &lt;- as.integer(seq(1, nrow(part_1), by = 0.25 * nrow(part_1)))
limits &lt;- c(limits, nrow(part_1))
h_list &lt;- list()
for (i in 1:(length(limits) - 1)) {
h_list[[i]] &lt;- part_1[limits[i]:(limits[i + 1] - 1), ]
}
return(h_list)
}
h_list &lt;- create_h(part_1)
for (i in seq_along(h_list)) {
assign(paste0(&quot;h_&quot;, i), h_list[[i]])
}
###########################
part_2 = h_1 %&gt;% arrange(desc(weight))
limits = as.integer(seq(1, nrow(part_2), by = 0.25*nrow(part_2)))
limits = c(limits, nrow(part_2))
create_w &lt;- function(part_2) {
limits &lt;- as.integer(seq(1, nrow(part_2), by = 0.25 * nrow(part_2)))
limits &lt;- c(limits, nrow(part_2))
w_list &lt;- list()
for (i in 1:(length(limits) - 1)) {
w_list[[i]] &lt;- part_1[limits[i]:(limits[i + 1] - 1), ]
}
return(w_list)
}
w_list &lt;- create_h(part_2)
for (i in seq_along(w_list)) {
assign(paste0(&quot;w_&quot;, i), w_list[[i]])
}
##############################
r1 = data.frame(gender = &quot;male&quot;, status = &quot;immigrant&quot;, min_height = min(w_1$height), max_height = max(w_2$height), min_weight = min(w_1$weight), max_weight = max(w_1$weight), disease_rate = mean(w_1$disease), total_count = nrow(w_1), disease_count = sum(w_1$disease == &quot;1&quot;) )

The result looks something like this:

  gender    status min_height max_height min_weight max_weight disease_rate total_count disease_count
1   male immigrant   157.7242   173.7773   94.77408   117.1924          0.6          40            24

My Question: I am trying to extend this code to calculate the disease rate for all such combinations. Below, I tried to represent this as a tree diagram (in which my code corresponds to the "red box"):

R: 为嵌套数据集编写函数

(note: this is obviously just a small part of the tree - it is almost impossible to fully draw the whole tree)

Can someone please show me how I take my general code structure and create results r_1, r_2.... r_n for all possible combinations in this tree - and then append them into a single dataset? Is there a quick way to do this with function/loop based approach?

Thanks!

Sources: https://www.smartdraw.com/software/tree-diagram-maker.htm

答案1

得分: 3

这是一个使用dplyr的解决方案:

set.seed(123)
library(dplyr, warn.conflicts = FALSE)

Patient_ID <- 1:5000
gender <- c("Male", "Female")
gender <- sample(gender, 5000, replace = TRUE, prob = c(0.45, 0.55))
gender <- as.factor(gender)

status <- c("Immigrant", "Citizen")
status <- sample(status, 5000, replace = TRUE, prob = c(0.3, 0.7))
status <- as.factor(status)

height <- rnorm(5000, 150, 10)
weight <- rnorm(5000, 90, 10)

disease <- sample(c(1, 0), 5000, replace = TRUE)

my_data <- data.frame(Patient_ID, gender, status, height, weight, disease)

my_data %>%
  group_by(gender, status, ntile(height, 4), ntile(weight, 4)) %>%
  summarise(
    min_height = min(height),
    max_height = max(height),
    min_weight = min(weight),
    max_weight = max(weight),
    disease_rate = mean(disease),
    disease_count = sum(disease),
    total_count = n(),
    .groups = "keep"
  ) %>%
  ungroup() %>%
  mutate(
    height_group = factor(`ntile(height, 4)`,
      levels = c(1, 2, 3, 4), labels = c("0-25%", "25-50%", "50-75%", "75-100%")
    ),
    weight_group = factor(`ntile(weight, 4)`,
      levels = c(1, 2, 3, 4), labels = c("0-25%", "25-50%", "50-75%", "75-100%")
    )
  ) %>%
  select(-starts_with("ntile"))
#> # A tibble: 64 × 11
#>    gender status  min_height max_height min_weight max_weight disease_rate
#>    <fct>  <fct>        <dbl>      <dbl>      <dbl>      <dbl>        <dbl>
#>  1 Female Citizen       123.       143.       59.9       83.5        0.496
#>  2 Female Citizen       127.       143.       83.5       90.0        0.5  
#>  3 Female Citizen       116.       143.       90.3       97.0        0.496
#>  4 Female Citizen       119.       143.       97.1      121.         0.558
#>  5 Female Citizen       143.       150.       59.0       83.5        0.483
#>  6 Female Citizen       143.       150.       83.5       90.0        0.508
#>  7 Female Citizen       143.       150.       90.1       96.9        0.510
#>  8 Female Citizen       143.       150.       97.1      119.         0.407
#>  9 Female Citizen       150.       157.       60.1       83.5        0.5  
#> 10 Female Citizen       150.       157.       83.5       90.0        0.488
#> # ℹ 54 more rows
#> # ℹ 4 more variables: disease_count <dbl>, total_count <int>,
#> #   height_group <fct>, weight_group <fct>

创建于2023年7月10日,使用reprex v2.0.2

英文:

Here's a dplyr solution:

set.seed(123)
library(dplyr, warn.conflicts = FALSE)

Patient_ID &lt;- 1:5000
gender &lt;- c(&quot;Male&quot;, &quot;Female&quot;)
gender &lt;- sample(gender, 5000, replace = TRUE, prob = c(0.45, 0.55))
gender &lt;- as.factor(gender)


status &lt;- c(&quot;Immigrant&quot;, &quot;Citizen&quot;)
status &lt;- sample(status, 5000, replace = TRUE, prob = c(0.3, 0.7))
status &lt;- as.factor(status)

height &lt;- rnorm(5000, 150, 10)
weight &lt;- rnorm(5000, 90, 10)

disease &lt;- sample(c(1, 0), 5000, replace = TRUE)

my_data &lt;- data.frame(Patient_ID, gender, status, height, weight, disease)

my_data %&gt;%
  group_by(gender, status, ntile(height, 4), ntile(weight, 4)) %&gt;%
  summarise(
    min_height = min(height),
    max_height = max(height),
    min_weight = min(weight),
    max_weight = max(weight),
    disease_rate = mean(disease),
    disease_count = sum(disease),
    total_count = n(),
    .groups = &quot;keep&quot;
  ) %&gt;%
  ungroup() %&gt;%
  mutate(
    height_group = factor(`ntile(height, 4)`,
      levels = c(1, 2, 3, 4), labels = c(&quot;0-25%&quot;, &quot;25-50%&quot;, &quot;50-75%&quot;, &quot;75-100%&quot;)
    ),
    weight_group = factor(`ntile(weight, 4)`,
      levels = c(1, 2, 3, 4), labels = c(&quot;0-25%&quot;, &quot;25-50%&quot;, &quot;50-75%&quot;, &quot;75-100%&quot;)
    )
  ) %&gt;%
  select(-starts_with(&quot;ntile&quot;))
#&gt; # A tibble: 64 &#215; 11
#&gt;    gender status  min_height max_height min_weight max_weight disease_rate
#&gt;    &lt;fct&gt;  &lt;fct&gt;        &lt;dbl&gt;      &lt;dbl&gt;      &lt;dbl&gt;      &lt;dbl&gt;        &lt;dbl&gt;
#&gt;  1 Female Citizen       123.       143.       59.9       83.5        0.496
#&gt;  2 Female Citizen       127.       143.       83.5       90.0        0.5  
#&gt;  3 Female Citizen       116.       143.       90.3       97.0        0.496
#&gt;  4 Female Citizen       119.       143.       97.1      121.         0.558
#&gt;  5 Female Citizen       143.       150.       59.0       83.5        0.483
#&gt;  6 Female Citizen       143.       150.       83.5       90.0        0.508
#&gt;  7 Female Citizen       143.       150.       90.1       96.9        0.510
#&gt;  8 Female Citizen       143.       150.       97.1      119.         0.407
#&gt;  9 Female Citizen       150.       157.       60.1       83.5        0.5  
#&gt; 10 Female Citizen       150.       157.       83.5       90.0        0.488
#&gt; # ℹ 54 more rows
#&gt; # ℹ 4 more variables: disease_count &lt;dbl&gt;, total_count &lt;int&gt;,
#&gt; #   height_group &lt;fct&gt;, weight_group &lt;fct&gt;

<sup>Created on 2023-07-10 with reprex v2.0.2</sup>

答案2

得分: 2

以下是您要翻译的内容:

This can be solved by first binning the heights and weights and then using table and aggregate.

# use 'cut' to bin height and weight, set the 
# values to the quantiles' intervals
qnt <- quantile(my_data$height)
lbls <- paste(names(qnt)[-5], names(qnt)[-1], sep = "-")
my_data$height_quant <- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
my_data$height_quant <- paste(my_data$height_quant, "Tallest")

# only compute the quantile's cut points, the labels 
# are built on the same lbls variable defined above
qnt <- quantile(my_data$weight)
my_data$weight_quant <- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
my_data$weight_quant <- paste(my_data$weight_quant, "Heaviest")

cols <- c("gender", "status", "height_quant", "weight_quant", "disease")

tbl <- table(my_data[cols])
# compute proprotions table, don't show
# proportions(tbl)

ftbl <- ftable(tbl)
# compute proprotions table, don't show
# proportions(ftbl)

# coerce the tables to data.frames
df_tbl <- as.data.frame(tbl)
df_tbl_prop <- as.data.frame(proportions(tbl))
df_ftbl <- as.data.frame(ftbl)
df_ftbl_prop <- as.data.frame(proportions(ftbl))

# examples of results data.frames
# total counts
head(df_tbl)
#>   gender    status    height_quant    weight_quant disease Freq
#> 1 Female   Citizen  0%-25% Tallest 0%-25% Heaviest       0   69
#> 2   Male   Citizen  0%-25% Tallest 0%-25% Heaviest       0   61
#> 3 Female Immigrant  0%-25% Tallest 0%-25% Heaviest       0   22
#> 4   Male Immigrant  0%-25% Tallest 0%-25% Heaviest       0   26
#> 5 Female   Citizen 25%-50% Tallest 0%-25% Heaviest       0   60
#> 6   Male   Citizen 25%-50% Tallest 0%-25% Heaviest       0   44

# proportions of disease on total counts
head(df_ftbl_prop)
#>   gender    status    height_quant    weight_quant disease   Freq
#> 1 Female   Citizen  0%-25% Tallest 0%-25% Heaviest       0 0.0138
#> 2   Male   Citizen  0%-25% Tallest 0%-25% Heaviest       0 0.0122
#> 3 Female Immigrant  0%-25% Tallest 0%-25% Heaviest       0 0.0044
#> 4   Male Immigrant  0%-25% Tallest 0%-25% Heaviest       0 0.0052
#> 5 Female   Citizen 25%-50% Tallest 0%-25% Heaviest       0 0.0120
#> 6   Male   Citizen 25%-50% Tallest 0%-25% Heaviest       0 0.0088

# now compute proportions of disease per gender,
# status and height and weight quantiles
agg <- aggregate(Freq ~ gender + status + height_quant + weight_quant, df_tbl, \(x) x/sum(x))
colnames(agg[[ncol(agg)]]) <- paste0("disease.", 0:1)
agg <- cbind(agg[-ncol(agg)], agg[[ncol(agg)]])

head(agg)
#>   gender    status    height_quant    weight_quant disease.0 disease.1
#> 1 Female   Citizen  0%-25% Tallest 0%-25% Heaviest 0.5036496 0.4963504
#> 2   Male   Citizen  0%-25% Tallest 0%-25% Heaviest 0.5495495 0.4504505
#> 3 Female Immigrant  0%-25% Tallest 0%-25% Heaviest 0.4782609 0.5217391
#> 4   Male Immigrant  0%-25% Tallest 0%-25% Heaviest 0.6842105 0.3157895
#> 5 Female   Citizen 25%-50% Tallest 0%-25% Heaviest 0.5172414 0.4827586
#> 6   Male   Citizen 25%-50% Tallest 0%-25% Heaviest 0.4313725 0.5686275

<sup>Created on 2023-07-10 with reprex v2.0.2</sup>

Edit

To include min and max of height and weight turned out to be more complicated than expected. Here is a full example, duplicating most of the code above.

# use 'cut' to bin height and weight, set the 
# values to the quantiles' intervals
qnt <- quantile(my_data$height)
lbls <- paste(names(qnt)[-5], names(qnt)[-1], sep = "-")
my_data$height_quant <- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
my_data$height_quant <- paste(my_data$height_quant, "Tallest")
tmp <- aggregate(height ~ height_quant, my_data, min)
my_data <- merge(my_data, tmp, by = "height_quant", suffixes = c("", "_min"))
tmp <- aggregate(height ~ height_quant, my_data, max)
my_data <- merge(my_data, tmp, by = "height_quant", suffixes = c("", "_max"))

# only compute the quantile's cut points, the labels 
# are built on the same lbls variable defined above
qnt <- quantile(my_data$weight)
my_data$weight_quant <- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
my_data$weight_quant <- paste(my_data$weight_quant, "Heaviest")
tmp <- aggregate(weight ~ weight_quant, my_data, min)
my_data <- merge(my_data, tmp, by = "weight_quant", suffixes = c("", "_min"))
tmp <- aggregate(weight ~ weight_quant, my_data, max)
my_data <- merge(my_data, tmp, by = "weight_quant", suffixes = c("", "_max"))
rm(tmp)

cols <- c("gender", "status", 
          "height_quant", "height_min", "height_max",
          "weight_quant", "weight_min", "weight_max",
          "disease")

tbl <- table(my_data[cols])
# compute proprotions table, don't

<details>
<summary>英文:</summary>

This can be solved by first binning the heights and weights and then using `table` and `aggregate`.

``` r
# use &#39;cut&#39; to bin height and weight, set the 
# values to the quantiles&#39; intervals
qnt &lt;- quantile(my_data$height)
lbls &lt;- paste(names(qnt)[-5], names(qnt)[-1], sep = &quot;-&quot;)
my_data$height_quant &lt;- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
my_data$height_quant &lt;- paste(my_data$height_quant, &quot;Tallest&quot;)

# only compute the quantile&#39;s cut points, the labels 
# are built on the same lbls variable defined above
qnt &lt;- quantile(my_data$weight)
my_data$weight_quant &lt;- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
my_data$weight_quant &lt;- paste(my_data$weight_quant, &quot;Heaviest&quot;)

cols &lt;- c(&quot;gender&quot;, &quot;status&quot;, &quot;height_quant&quot;, &quot;weight_quant&quot;, &quot;disease&quot;)

tbl &lt;- table(my_data[cols])
# compute proprotions table, don&#39;t show
# proportions(tbl)

ftbl &lt;- ftable(tbl)
# compute proprotions table, don&#39;t show
# proportions(ftbl)

# coerce the tables to data.frames
df_tbl &lt;- as.data.frame(tbl)
df_tbl_prop &lt;- as.data.frame(proportions(tbl))
df_ftbl &lt;- as.data.frame(ftbl)
df_ftbl_prop &lt;- as.data.frame(proportions(ftbl))

# examples of results data.frames
# total counts
head(df_tbl)
#&gt;   gender    status    height_quant    weight_quant disease Freq
#&gt; 1 Female   Citizen  0%-25% Tallest 0%-25% Heaviest       0   69
#&gt; 2   Male   Citizen  0%-25% Tallest 0%-25% Heaviest       0   61
#&gt; 3 Female Immigrant  0%-25% Tallest 0%-25% Heaviest       0   22
#&gt; 4   Male Immigrant  0%-25% Tallest 0%-25% Heaviest       0   26
#&gt; 5 Female   Citizen 25%-50% Tallest 0%-25% Heaviest       0   60
#&gt; 6   Male   Citizen 25%-50% Tallest 0%-25% Heaviest       0   44

# proportions of disease on total counts
head(df_ftbl_prop)
#&gt;   gender    status    height_quant    weight_quant disease   Freq
#&gt; 1 Female   Citizen  0%-25% Tallest 0%-25% Heaviest       0 0.0138
#&gt; 2   Male   Citizen  0%-25% Tallest 0%-25% Heaviest       0 0.0122
#&gt; 3 Female Immigrant  0%-25% Tallest 0%-25% Heaviest       0 0.0044
#&gt; 4   Male Immigrant  0%-25% Tallest 0%-25% Heaviest       0 0.0052
#&gt; 5 Female   Citizen 25%-50% Tallest 0%-25% Heaviest       0 0.0120
#&gt; 6   Male   Citizen 25%-50% Tallest 0%-25% Heaviest       0 0.0088

# now compute proportions of disease per gender,
# status and height and weight quantiles
agg &lt;- aggregate(Freq ~ gender + status + height_quant + weight_quant, df_tbl, \(x) x/sum(x))
colnames(agg[[ncol(agg)]]) &lt;- paste0(&quot;disease.&quot;, 0:1)
agg &lt;- cbind(agg[-ncol(agg)], agg[[ncol(agg)]])

head(agg)
#&gt;   gender    status    height_quant    weight_quant disease.0 disease.1
#&gt; 1 Female   Citizen  0%-25% Tallest 0%-25% Heaviest 0.5036496 0.4963504
#&gt; 2   Male   Citizen  0%-25% Tallest 0%-25% Heaviest 0.5495495 0.4504505
#&gt; 3 Female Immigrant  0%-25% Tallest 0%-25% Heaviest 0.4782609 0.5217391
#&gt; 4   Male Immigrant  0%-25% Tallest 0%-25% Heaviest 0.6842105 0.3157895
#&gt; 5 Female   Citizen 25%-50% Tallest 0%-25% Heaviest 0.5172414 0.4827586
#&gt; 6   Male   Citizen 25%-50% Tallest 0%-25% Heaviest 0.4313725 0.5686275

<sup>Created on 2023-07-10 with reprex v2.0.2</sup>


Edit

To include min and max of height and weight turned out to be more complicated than expected. Here is a full example, duplicating most of the code above. But getting the job done.

# use &#39;cut&#39; to bin height and weight, set the 
# values to the quantiles&#39; intervals
qnt &lt;- quantile(my_data$height)
lbls &lt;- paste(names(qnt)[-5], names(qnt)[-1], sep = &quot;-&quot;)
my_data$height_quant &lt;- cut(my_data$height, qnt, labels = lbls, include.lowest = TRUE)
my_data$height_quant &lt;- paste(my_data$height_quant, &quot;Tallest&quot;)
tmp &lt;- aggregate(height ~ height_quant, my_data, min)
my_data &lt;- merge(my_data, tmp, by = &quot;height_quant&quot;, suffixes = c(&quot;&quot;, &quot;_min&quot;))
tmp &lt;- aggregate(height ~ height_quant, my_data, max)
my_data &lt;- merge(my_data, tmp, by = &quot;height_quant&quot;, suffixes = c(&quot;&quot;, &quot;_max&quot;))

# only compute the quantile&#39;s cut points, the labels 
# are built on the same lbls variable defined above
qnt &lt;- quantile(my_data$weight)
my_data$weight_quant &lt;- cut(my_data$weight, qnt, labels = lbls, include.lowest = TRUE)
my_data$weight_quant &lt;- paste(my_data$weight_quant, &quot;Heaviest&quot;)
tmp &lt;- aggregate(weight ~ weight_quant, my_data, min)
my_data &lt;- merge(my_data, tmp, by = &quot;weight_quant&quot;, suffixes = c(&quot;&quot;, &quot;_min&quot;))
tmp &lt;- aggregate(weight ~ weight_quant, my_data, max)
my_data &lt;- merge(my_data, tmp, by = &quot;weight_quant&quot;, suffixes = c(&quot;&quot;, &quot;_max&quot;))
rm(tmp)

cols &lt;- c(&quot;gender&quot;, &quot;status&quot;, 
          &quot;height_quant&quot;, &quot;height_min&quot;, &quot;height_max&quot;,
          &quot;weight_quant&quot;, &quot;weight_min&quot;, &quot;weight_max&quot;,
          &quot;disease&quot;)

tbl &lt;- table(my_data[cols])
# compute proprotions table, don&#39;t show
# proportions(tbl)

ftbl &lt;- ftable(tbl)
# compute proprotions table, don&#39;t show
# proportions(ftbl)

# coerce the tables to data.frames
df_tbl &lt;- as.data.frame(tbl)
df_tbl_prop &lt;- as.data.frame(proportions(tbl))
df_ftbl &lt;- as.data.frame(ftbl)
df_ftbl_prop &lt;- as.data.frame(proportions(ftbl))

# examples of results data.frames
# total counts
head(df_tbl)
#&gt;   gender    status    height_quant     height_min       height_max    weight_quant       weight_min       weight_max disease Freq
#&gt; 1 Female   Citizen  0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0   69
#&gt; 2   Male   Citizen  0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0   61
#&gt; 3 Female Immigrant  0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0   22
#&gt; 4   Male Immigrant  0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0   26
#&gt; 5 Female   Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0    0
#&gt; 6   Male   Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0    0

# proportions of disease on total counts
head(df_ftbl_prop)
#&gt;   gender    status    height_quant     height_min       height_max    weight_quant       weight_min       weight_max disease   Freq
#&gt; 1 Female   Citizen  0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0 0.0138
#&gt; 2   Male   Citizen  0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0 0.0122
#&gt; 3 Female Immigrant  0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0 0.0044
#&gt; 4   Male Immigrant  0%-25% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0 0.0052
#&gt; 5 Female   Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0 0.0000
#&gt; 6   Male   Citizen 25%-50% Tallest 111.5467983054 143.165045267515 0%-25% Heaviest 56.5309750696496 83.4974265243399       0 0.0000

# now compute proportions of disease per gender,
# status and height and weight quantiles
df_tbl &lt;- df_tbl[df_tbl$Freq != 0, ]
agg &lt;- aggregate(Freq ~ gender + status + 
                   height_quant + height_min + height_max + 
                   height_quant + weight_min + weight_max, 
                 df_tbl, \(x) x/sum(x))
colnames(agg[[ncol(agg)]]) &lt;- paste0(&quot;disease.&quot;, 0:1)
agg &lt;- cbind(agg[-ncol(agg)], agg[[ncol(agg)]])
agg[4:7] &lt;- lapply(agg[4:7], \(x) as.numeric(as.character(x)))

head(agg)
#&gt;   gender    status    height_quant height_min height_max weight_min weight_max disease.0 disease.1
#&gt; 1 Female   Citizen  0%-25% Tallest   111.5468   143.1650   56.53098   83.49743 0.5036496 0.4963504
#&gt; 2   Male   Citizen  0%-25% Tallest   111.5468   143.1650   56.53098   83.49743 0.5495495 0.4504505
#&gt; 3 Female Immigrant  0%-25% Tallest   111.5468   143.1650   56.53098   83.49743 0.4782609 0.5217391
#&gt; 4   Male Immigrant  0%-25% Tallest   111.5468   143.1650   56.53098   83.49743 0.6842105 0.3157895
#&gt; 5 Female   Citizen 25%-50% Tallest   143.1659   149.8666   56.53098   83.49743 0.5172414 0.4827586
#&gt; 6   Male   Citizen 25%-50% Tallest   143.1659   149.8666   56.53098   83.49743 0.4313725 0.5686275

<sup>Created on 2023-07-10 with reprex v2.0.2</sup>

huangapple
  • 本文由 发表于 2023年7月10日 12:10:32
  • 转载请务必保留本文链接:https://go.coder-hub.com/76650638.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定