英文:
Generate summary statistics by subgroup with stargazer
问题
我尝试使用R中的stargazer
生成Latex的摘要统计表。表格应该按三个子组(卢旺达/洪都拉斯/尼泊尔)对摘要统计进行排序。
当我仅为子组生成单独的表格时,它运行正常。我认为可能是国家变量引起的问题。
all_summary
数据框的结构如下:
structure(list(country = structure(c("Honduras", "Nepal", "Rwanda"), label = "Country", format.stata = "%8s"), headGender = structure(c(0, 1, 0), label = "head_gender", format.stata = "%9.0g"), femaleEduc = structure(c(1, 2, 2), label = "female_educ", format.stata = "%9.0g"), maleEduc = structure(c(1, 1, 2), label = "male_educ", format.stata = "%9.0g"), wVispeople = structure(c(0, 1, 0), label = "w_visitpeople", format.stata = "%9.0g"), wVismarket = structure(c(0, 1, 1), label = "w_vismarket", format.stata = "%9.0g"), wLeavevill = structure(c(0, 1, 0), label = "w_leavevill", format.stata = "%9.0g"), fridge = structure(c(1, 0, 0), label = "fridge_owned_desired", format.stata = "%9.0g"), radio = structure(c(1, 1, 1), label = "radio_owned_desired", format.stata = "%9.0g"), fan = structure(c(0, 0, 0), label = "fan_owned_desired", format.stata = "%9.0g"), pc = structure(c(0, 0, 0), label = "pc_owned_desired", format.stata = "%9.0g"), tv = structure(c(1, 0, 1), label = "tv_owned_desired", format.stata = "%9.0g"), minutesSolid = structure(c(3, 2, 448), label = "stoveuseminutes_solids", format.stata = "%9.0g"), minutesClean = structure(c(0, 0, 0), label = "stoveuseminutes_clean", format.stata = "%9.0g"), stoveClean = structure(c(0, 0, 0), label = "stove_clean", format.stata = "%9.0g")), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"), label = "Written by R.")
以下是代码的样子:
all_summary <- allcountries %>%
select(Country, head_gender, female_educ, male_educ, w_visitpeople, w_vismarket, w_leavevill, fridge_owned_desired, radio_owned_desired, fan_owned_desired, pc_owned_desired, tv_owned_desired, stoveuseminutes_solids, stoveuseminutes_clean, stove_clean) %>%
rename(country = Country,
headGender = head_gender,
femaleEduc = female_educ,
maleEduc = male_educ,
wVispeople = w_visitpeople,
wVismarket = w_vismarket,
wLeavevill = w_leavevill,
fridge = fridge_owned_desired,
radio = radio_owned_desired,
fan = fan_owned_desired,
pc = pc_owned_desired,
tv = tv_owned_desired,
minutesSolid = stoveuseminutes_solids,
minutesClean = stoveuseminutes_clean,
stoveClean = stove_clean)
# 按国家分组
all_summary_grouped <- all_summary %>% group_by(country)
sumstats_all_grouped <-
all_summary_grouped %>%
summarise_each(funs(
n = sum(!is.na(.)),
min = min(., na.rm = TRUE),
max = max(., na.rm = TRUE),
mean = mean(., na.rm = TRUE)
))
# 重塑数据
sumstatsA <- sumstats_all_grouped %>%
gather(stat, val) %>%
separate(stat, into = c ("var", "stat"), sep = "_") %>%
spread(stat, val) %>%
select(var, n, min, max, mean)
# 四舍五入
sumstatsA = sumstatsA %>%
mutate(mean = round(as.numeric(mean),2))
# 生成表格
stargazer(
sumstatsA,
summary = F,
type = "text",
digits = 2,
header = F ,
title = "Summary statistics for Honduras, Nepal and Rwanda",
rownames = F,
out = "Manuscript/Tables/SummaryAll_grouped.tex")
错误发生在# 重塑数据
命令部分之后。
(在spread()
中的错误:
! 输出的每一行必须由唯一的键组合标识。
键在171行之间共享:
- 112, 113, 114
- 91, 92, 93
- 106, 107, 108)
英文:
I try to generate a summary statistics table for latex in R with stargazer
. The table should contain the summary statistics sorted by three subgroups (Rwanda/Honduras/Nepal).
It worked out fine, when I did seperate tables only for the subgroup. I thought maybe the country variable is the problem.
The all_summary
data frame looks like this:
structure(list(country = structure(c("Honduras", "Nepal", "Rwanda"
), label = "Country", format.stata = "%8s"), headGender = structure(c(0,
1, 0), label = "head_gender", format.stata = "%9.0g"), femaleEduc = structure(c(1,
2, 2), label = "female_educ", format.stata = "%9.0g"), maleEduc = structure(c(1,
1, 2), label = "male_educ", format.stata = "%9.0g"), wVispeople = structure(c(0,
1, 0), label = "w_visitpeople", format.stata = "%9.0g"), wVismarket = structure(c(0,
1, 1), label = "w_vismarket", format.stata = "%9.0g"), wLeavevill = structure(c(0,
1, 0), label = "w_leavevill", format.stata = "%9.0g"), fridge = structure(c(1,
0, 0), label = "fridge_owned_desired", format.stata = "%9.0g"),
radio = structure(c(1, 1, 1), label = "radio_owned_desired", format.stata = "%9.0g"),
fan = structure(c(0, 0, 0), label = "fan_owned_desired", format.stata = "%9.0g"),
pc = structure(c(0, 0, 0), label = "pc_owned_desired", format.stata = "%9.0g"),
tv = structure(c(1, 0, 1), label = "tv_owned_desired", format.stata = "%9.0g"),
minutesSolid = structure(c(3, 2, 448), label = "stoveuseminutes_solids", format.stata = "%9.0g"),
minutesClean = structure(c(0, 0, 0), label = "stoveuseminutes_clean", format.stata = "%9.0g"),
stoveClean = structure(c(0, 0, 0), label = "stove_clean", format.stata = "%9.0g")), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"), label = "Written by R.")
This is how the code looks like:
all_summary <- allcountries %>%
select(Country, head_gender, female_educ, male_educ, w_visitpeople, w_vismarket, w_leavevill, fridge_owned_desired, radio_owned_desired, fan_owned_desired, pc_owned_desired, tv_owned_desired, stoveuseminutes_solids, stoveuseminutes_clean, stove_clean) %>%
rename(country = Country,
headGender = head_gender,
femaleEduc = female_educ,
maleEduc = male_educ,
wVispeople = w_visitpeople,
wVismarket = w_vismarket,
wLeavevill = w_leavevill,
fridge = fridge_owned_desired,
radio = radio_owned_desired,
fan = fan_owned_desired,
pc = pc_owned_desired,
tv = tv_owned_desired,
minutesSolid = stoveuseminutes_solids,
minutesClean = stoveuseminutes_clean,
stoveClean = stove_clean)
#Group by country
all_summary_grouped <- all_summary %>% group_by(country)
sumstats_all_grouped <-
all_summary_grouped %>%
summarise_each(funs(
n = sum(!is.na(.)),
min = min(., na.rm = TRUE),
max = max(., na.rm = TRUE),
mean = mean(., na.rm = TRUE)
))
#Reshape data
sumstatsA <- sumstats_all_grouped %>%
gather(stat, val) %>%
separate(stat, into = c ("var", "stat"), sep = "_") %>%
spread(stat, val) %>%
select(var, n, min, max, mean)
#Round
sumstatsA = sumstatsA %>%
mutate(mean = round(as.numeric(mean),2))
#produce table
stargazer(
sumstatsA,
summary = F,
type = "text",
digits = 2,
header = F ,
title = "Summary statistics for Honduras, Nepal and Rwanda",
rownames = F,
out = "Manuscript/Tables/SummaryAll_grouped.tex")
The error happens after the #Reshape data command section
(Error in spread()
:
! Each row of output must be identified by a unique combination of keys.
Keys are shared for 171 rows:
- 112, 113, 114
- 91, 92, 93
- 106, 107, 108
(to be continued))
答案1
得分: 0
以下是一些最小的原始数据:
library(tidyverse)
data(mtcars)
as_tibble(mtcars)
这是gtsummary
包,它可以生成每个组(例如,cyl
)的简单摘要统计信息,默认以HTML格式呈现(您可以对表格进行各种修改):
library(gtsummary)
mtcars %>% tbl_summary(by = cyl)
有选项可以将其打印为.tex文件:
# 打印为.tex文件
as_kable_extra(mtcars %>% tbl_summary(by = cyl), format = "latex")
以下是可以通过选项指定的各种统计数据,现在显示非缺失值的数量,均值,中位数,P25和P75以及从最小值到最大值的范围:
mtcars %>% tbl_summary(by = am,
type = all_continuous() ~ "continuous2",
statistic = all_continuous() ~ c(
"{N_nonmiss}",
"{mean}",
"{median} ({p25}, {p75})",
"{min}, {max}"
))
请注意,以上是一些R代码的片段和描述,无法提供完整的交互式演示。如果您需要更多详细信息或特定翻译,请提出具体问题。
英文:
Here is some minimal raw data:
library(tidyverse)
data(mtcars)
as_tibble(mtcars)
# A tibble: 32 × 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
# … with 22 more rows
# ℹ Use `print(n = ...)` to see more rows
This is the gtsummary
package that produces easy summary statistics per group (e.g. cyl
) in default html format (you can have various modifications for the table):
library(gtsummary)
mtcars %>% tbl_summary(by = cyl)
There are options to print this as .tex:
# print as tex
as_kable_extra(mtcars %>% tbl_summary(by = cyl), format = "latex")
\begin{tabular}{l|c|c|c}
\hline
\textbf{Characteristic} & \textbf{4}, N = 11 & \textbf{6}, N = 7 & \textbf{8}, N = 14\\
\hline
mpg & 26.0 (22.8, 30.4) & 19.7 (18.6, 21.0) & 15.2 (14.4, 16.2)\\
\hline
disp & 108 (79, 121) & 168 (160, 196) & 350 (302, 390)\\
\hline
hp & 91 (66, 96) & 110 (110, 123) & 192 (176, 241)\\
\hline
drat & 4.08 (3.81, 4.16) & 3.90 (3.35, 3.91) & 3.12 (3.07, 3.22)\\
\hline
wt & 2.20 (1.89, 2.62) & 3.21 (2.82, 3.44) & 3.76 (3.53, 4.01)\\
\hline
qsec & 18.90 (18.56, 19.95) & 18.30 (16.74, 19.17) & 17.18 (16.10, 17.56)\\
\hline
vs & 10 (91\%) & 4 (57\%) & 0 (0\%)\\
\hline
am & 8 (73\%) & 3 (43\%) & 2 (14\%)\\
\hline
gear & & & \\
\hline
\hspace{1em}3 & 1 (9.1\%) & 2 (29\%) & 12 (86\%)\\
\hline
\hspace{1em}4 & 8 (73\%) & 4 (57\%) & 0 (0\%)\\
\hline
\hspace{1em}5 & 2 (18\%) & 1 (14\%) & 2 (14\%)\\
\hline
carb & & & \\
\hline
\hspace{1em}1 & 5 (45\%) & 2 (29\%) & 0 (0\%)\\
\hline
\hspace{1em}2 & 6 (55\%) & 0 (0\%) & 4 (29\%)\\
\hline
\hspace{1em}3 & 0 (0\%) & 0 (0\%) & 3 (21\%)\\
\hline
\hspace{1em}4 & 0 (0\%) & 4 (57\%) & 6 (43\%)\\
\hline
\hspace{1em}6 & 0 (0\%) & 1 (14\%) & 0 (0\%)\\
\hline
\hspace{1em}8 & 0 (0\%) & 0 (0\%) & 1 (7.1\%)\\
\hline
\multicolumn{4}{l}{\rule{0pt}{1em}\textsuperscript{1} Median (IQR); n (\%)}\\
\end{tabular}
Update
Here is a list of various statistics that you can specify via options. It now shows the number of non-missings, the mean, the median, p25 and p75 and the range from min to max.
mtcars %>% tbl_summary(by = am,
type = all_continuous() ~ "continuous2",
statistic = all_continuous() ~ c(
"{N_nonmiss}",
"{mean}",
"{median} ({p25}, {p75})",
"{min}, {max}"
))
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论