使用R中的一个向量和同一数据集中的不同变量创建虚拟变量?

huangapple go评论64阅读模式
英文:

Create dummy variables using a vector and different variables from the same dataset in R?

问题

I understand that you want to translate the Stata code into R and create dummy variables for each "AGRUPA" variable based on the "agrucid10" vector. Here's the equivalent R code:

# Define the agrucid10 vector: the character codes that the dummy-variable
# loop below iterates over. Note that "O94_O99" is spelled the same way as the
# value that appears in the sample data's AGRUPA2/AGRUPA3 columns.
agrucid10 <- c("A00_A09", "A15_A19", "A20_A28", "A30_A49", "A50_A64", "A65_A69", "A70_A74", "A75_A79", "A80_A89", "A90_A99",
               "B00_B09", "B15_B19", "B20_B24", "B25_B34", "B35_B49", "B50_B64", "B65_B83", "B85_B89", "B90_B94", "B95_B97", 
               "B99", "C00_C14", "C15_C26", "C30_C39", "C40_C41", "C43_C44", "C45_C49", "C50", "C51_C58", "C60_C63", "C64_C68", 
               "C69_C72", "C73_C75", "C76_C80", "C81_C96", "C97", "D00_D09", "D10_D36", "D37_D48", "D50_D53", "D55_D59", 
               "D60_D64", "D65_D69", "D70_D77", "D80_D89", "E00_E07", "E10_E14", "E15_E16", "E20_E35", "E40_E46", "E50_E64",
               "E65_E68", "E70_E90", "F00_F09", "F10_F19", "F20_F29", "F30_F39", "F40_F48", "F50_F59", "F60_F69", "F70_F79",
               "F80_F89", "F90_F98", "F99", "G00_G09", "G10_G13", "G20_G26", "G30_G32", "G35_G37", "G40_G47", "G50_G59", "G60_G64",
               "G70_G73", "G80_G83", "G90_G99", "H00_H06", "H10_H13", "H15_H22", "H25_H28", "H30_H36", "H40_H42", "H43_H45",
               "H46_H48", "H49_H52", "H53_H54", "H55_H59", "H60_H62", "H65_H75", "H80_H83", "H90_H95", "I00_I02", "I05_I09",
               "I10_I15", "I20_I25", "I26_I28", "I30_I52", "I60_I69", "I70_I79", "I80_I89", "I95_I99", "J00_J06", "J09_J18",
               "J20_J22", "J30_J39", "J40_J47", "J60_J70", "J80_J84", "J85_J86", "J90_J94", "J95_J99", "K00_K14", "K20_K31",
               "K35_K38", "K40_K46", "K50_K52", "K55_K63", "K65_K67", "K70_K77", "K80_K87", "K90_K93", "L00_L08", "L10_L14",
               "L20_L30", "L40_L45", "L50_L54", "L55_L59", "L60_L75", "L80_L99", "M00_M25", "M30_M36", "M40_M54", "M60_M79",
               "M80_M94", "M95_M99", "N00_N08", "N10_N16", "N17_N19", "N20_N23", "N25_N29", "N30_N39", "N40_N51", "N60_N64",
               "N70_N77", "N80_N98", "N99", "O00_O08", "O10_O16", "O20_O29", "O30_O48", "O60_O75", "O80_O84", "O85_O92", 
               "O94_O99", "P00_P04", "P05_P08", "P10_P15", "P20_P29", "P35_P39", "P50_P61", "P70_P74", "P75_P78", "P80_P83",
               "P90_P96", "Q00_Q07", "Q10_Q18", "Q20_Q28", "Q30_Q34", "Q35_Q37", "Q38_Q45", "Q50_Q56", "Q60_Q64", "Q65_Q79",
               "Q80_Q89", "Q90_Q99", "R00_R09", "R10_R19", "R20_R23", "R25_R29", "R30_R39", "R40_R46", "R47_R49", "R50_R69",
               "R70_R79", "R80_R82", "R83_R89", "R90_R94", "R95_R99", "S00_S09", "S10_S19", "S20_S29", "S30_S39", "S40_S49",
               "S50_S59", "S60_S69", "S70_S79", "S80_S89", "S90_S99", "T00_T07", "T08_T14", "T15_T19", "T20_T32", "T33_T35",
               "T36_T50", "T51_T65", "T66_T78", "T79", "T80_T88", "T90_T98", "V01_V99", "W00_X59", "X60_X84", "X85_Y09",
               "Y10_Y34", "Y35_Y36", "Y40_Y84", "Y85_Y89", "Y90_Y98", "Z00_Z13", "Z20_Z29", "Z30_Z39", "Z40_Z54", "Z55_Z65",
               "Z70_Z76", "Z80_Z99", "U00_U49", "U80_U89")

# Create dummy variables in R
# For every AGRUPA column (AGRUPA2, AGRUPA3, ...) and every code in agrucid10,
# add a 0/1 column named <column><code>, e.g. "AGRUPA2A00_A09".
# The anchored pattern keeps already-created dummy columns (whose names also
# start with "AGRUPA") out of the loop if this code is run twice.
agrupa_cols <- grep("^AGRUPA[0-9]+$", names(data), value = TRUE)
for (agrupa_var in agrupa_cols) {
  for (agricud_value in agrucid10) {
    col_name <- paste0(agrupa_var, agricud_value)
    # Compare the AGRUPA column itself (not DO) with the code. %in% returns
    # FALSE for missing values, so the dummy is 0 rather than NA, which matches
    # the Stata generate ... = 0 / replace ... = 1 logic.
    data[[col_name]] <- as.numeric(data[[agrupa_var]] %in% agricud_value)
  }
}

# AGRUPA3, AGRUPA4 and AGRUPA5 are handled by the same loop. For the AGRUPB,
# AGRUPC, AGRUPD and AGRUPII families, change the prefix in the pattern above.

<details>
<summary>英文:</summary>

I'm trying to "translate" a Stata do-file to R and I'm looking for the most efficient way to do it. My data looks like this:

    # Example data: a DO column plus two AGRUPA code columns (10 rows).
    data <- data.frame(
      DO = c("205145", "587523", "484351", "453475", "456546",
             "15641334", "465466", "674646", "564635", "65255"),
      AGRUPA2 = c("B25_B34", "O94_O99", "A50_A64", "B25_B34", "B25_B34",
                  "U00_U49", "B25_B34", "O94_O99", "R50_R69", "J09_J18"),
      AGRUPA3 = c("O94_O99", "A50_A64", "B25_B34", "B25_B34", "U00_U49",
                  "B25_B34", "O94_O99", "R50_R69", "J09_J18", "C81_C96")
    )

The full dataset has 4 "AGRUPA" variables (AGRUPA2, AGRUPA3, AGRUPA4, AGRUPA5). The same happens with "AGRUPB", "AGRUPC", "AGRUPD". And there's also AGRUPII that has 5 variables (AGRUPII2:AGRUPII6). Anyway, the vector contains the categories from the "AGRUPA" variables.

The vector in question:

    # Codes to build dummies for. expression() captures the bare names and
    # as.character() turns each one into a string, so no quoting is needed.
    # "O94_O99" is spelled as it appears in the sample data (not "O94_O099").
    agrucid10 <- as.character(expression(A00_A09, A15_A19, A20_A28, A30_A49, A50_A64, A65_A69, A70_A74, A75_A79, A80_A89, A90_A99,
                   B00_B09, B15_B19, B20_B24, B25_B34, B35_B49, B50_B64, B65_B83, B85_B89, B90_B94, B95_B97,
                   B99, C00_C14, C15_C26, C30_C39, C40_C41, C43_C44, C45_C49, C50, C51_C58, C60_C63, C64_C68,
                   C69_C72, C73_C75, C76_C80, C81_C96, C97, D00_D09, D10_D36, D37_D48, D50_D53, D55_D59,
                   D60_D64, D65_D69, D70_D77, D80_D89, E00_E07, E10_E14, E15_E16, E20_E35, E40_E46, E50_E64,
                   E65_E68, E70_E90, F00_F09, F10_F19, F20_F29, F30_F39, F40_F48, F50_F59, F60_F69, F70_F79,
                   F80_F89, F90_F98, F99, G00_G09, G10_G13, G20_G26, G30_G32, G35_G37, G40_G47, G50_G59, G60_G64,
                   G70_G73, G80_G83, G90_G99, H00_H06, H10_H13, H15_H22, H25_H28, H30_H36, H40_H42, H43_H45,
                   H46_H48, H49_H52, H53_H54, H55_H59, H60_H62, H65_H75, H80_H83, H90_H95, I00_I02, I05_I09,
                   I10_I15, I20_I25, I26_I28, I30_I52, I60_I69, I70_I79, I80_I89, I95_I99, J00_J06, J09_J18,
                   J20_J22, J30_J39, J40_J47, J60_J70, J80_J84, J85_J86, J90_J94, J95_J99, K00_K14, K20_K31,
                   K35_K38, K40_K46, K50_K52, K55_K63, K65_K67, K70_K77, K80_K87, K90_K93, L00_L08, L10_L14,
                   L20_L30, L40_L45, L50_L54, L55_L59, L60_L75, L80_L99, M00_M25, M30_M36, M40_M54, M60_M79,
                   M80_M94, M95_M99, N00_N08, N10_N16, N17_N19, N20_N23, N25_N29, N30_N39, N40_N51, N60_N64,
                   N70_N77, N80_N98, N99, O00_O08, O10_O16, O20_O29, O30_O48, O60_O75, O80_O84, O85_O92,
                   O94_O99, P00_P04, P05_P08, P10_P15, P20_P29, P35_P39, P50_P61, P70_P74, P75_P78, P80_P83,
                   P90_P96, Q00_Q07, Q10_Q18, Q20_Q28, Q30_Q34, Q35_Q37, Q38_Q45, Q50_Q56, Q60_Q64, Q65_Q79,
                   Q80_Q89, Q90_Q99, R00_R09, R10_R19, R20_R23, R25_R29, R30_R39, R40_R46, R47_R49, R50_R69,
                   R70_R79, R80_R82, R83_R89, R90_R94, R95_R99, S00_S09, S10_S19, S20_S29, S30_S39, S40_S49,
                   S50_S59, S60_S69, S70_S79, S80_S89, S90_S99, T00_T07, T08_T14, T15_T19, T20_T32, T33_T35,
                   T36_T50, T51_T65, T66_T78, T79, T80_T88, T90_T98, V01_V99, W00_X59, X60_X84, X85_Y09,
                   Y10_Y34, Y35_Y36, Y40_Y84, Y85_Y89, Y90_Y98, Z00_Z13, Z20_Z29, Z30_Z39, Z40_Z54, Z55_Z65,
                   Z70_Z76, Z80_Z99, U00_U49, U80_U89))

I need to create a dummy variable for each "AGRUPA" variable, adding each agrucid10 to the variable name. It would create something like this: "AGRUPA2A00_A09", "AGRUPA2A15_A19". And if the agrucid10 in question is contained in its respective "AGRUPA" variable, it receives 1, or else 0. I thought about doing the following:

    # Manual approach: one ifelse() call per (AGRUPA column, code) pair.
    data$AGRUPA2A00_A09 <- ifelse(data$AGRUPA2 == "A00_A09", 1, 0)
    data$AGRUPA3A00_A09 <- ifelse(data$AGRUPA3 == "A00_A09", 1, 0)

But it would take ages since there are many groups. Here's the Stata code if it helps:

    global agrucid10 A00_A09 A15_A19 A20_A28 A30_A49 A50_A64 A65_A69 A70_A74 A75_A79 A80_A89 A90_A99 B00_B09 B15_B19 B20_B24 B25_B34 B35_B49 B50_B64 B65_B83 B85_B89 B90_B94 B95_B97 B99 C00_C14 C15_C26 C30_C39 C40_C41 C43_C44 C45_C49 C50 C51_C58 C60_C63 C64_C68 C69_C72 C73_C75 C76_C80 C81_C96 C97 D00_D09 D10_D36 D37_D48 D50_D53 D55_D59 D60_D64 D65_D69 D70_D77 D80_D89 E00_E07 E10_E14 E15_E16 E20_E35 E40_E46 E50_E64 E65_E68 E70_E90 F00_F09 F10_F19 F20_F29 F30_F39 F40_F48 F50_F59 F60_F69 F70_F79 F80_F89 F90_F98 F99 G00_G09 G10_G13 G20_G26 G30_G32 G35_G37 G40_G47 G50_G59 G60_G64 G70_G73 G80_G83 G90_G99 H00_H06 H10_H13 H15_H22 H25_H28 H30_H36 H40_H42 H43_H45 H46_H48 H49_H52 H53_H54 H55_H59 H60_H62 H65_H75 H80_H83 H90_H95 I00_I02 I05_I09 I10_I15 I20_I25 I26_I28 I30_I52 I60_I69 I70_I79 I80_I89 I95_I99 J00_J06 J09_J18 J20_J22 J30_J39 J40_J47 J60_J70 J80_J84 J85_J86 J90_J94 J95_J99 K00_K14 K20_K31 K35_K38 K40_K46 K50_K52 K55_K63 K65_K67 K70_K77 K80_K87 K90_K93 L00_L08 L10_L14 L20_L30 L40_L45 L50_L54 L55_L59 L60_L75 L80_L99 M00_M25 M30_M36 M40_M54 M60_M79 M80_M94 M95_M99 N00_N08 N10_N16 N17_N19 N20_N23 N25_N29 N30_N39 N40_N51 N60_N64 N70_N77 N80_N98 N99 O00_O08 O10_O16 O20_O29 O30_O48 O60_O75 O80_O84 O85_O92 O94_O099 P00_P04 P05_P08 P10_P15 P20_P29 P35_P39 P50_P61 P70_P74 P75_P78 P80_P83 P90_P96 Q00_Q07 Q10_Q18 Q20_Q28 Q30_Q34 Q35_Q37 Q38_Q45 Q50_Q56 Q60_Q64 Q65_Q79 Q80_Q89 Q90_Q99 R00_R09 R10_R19 R20_R23 R25_R29 R30_R39 R40_R46 R47_R49 R50_R69 R70_R79 R80_R82 R83_R89 R90_R94 R95_R99 S00_S09 S10_S19 S20_S29 S30_S39 S40_S49 S50_S59 S60_S69 S70_S79 S80_S89 S90_S99 T00_T07 T08_T14 T15_T19 T20_T32 T33_T35 T36_T50 T51_T65 T66_T78 T79 T80_T88 T90_T98 V01_V99 W00_X59 X60_X84 X85_Y09 Y10_Y34 Y35_Y36 Y40_Y84 Y85_Y89 Y90_Y98 Z00_Z13 Z20_Z29 Z30_Z39 Z40_Z54 Z55_Z65 Z70_Z76 Z80_Z99 U00_U49 U80_U89
     
    * For each family suffix (a, b, c, d, ii), each position 2-7 and each code
    * in the global macro agrucid10: create a dummy initialised to 0, then set
    * it to 1 where agrup<suffix><position> equals the code.
    foreach x in "a" "b" "c" "d" "ii" {
        forval pos = 2/7 {
            foreach ag of global agrucid10 {
                generate agrupam`x'`pos'`ag'=0
                replace agrupam`x'`pos'`ag'=1 if agrup`x'`pos'=="`ag'"
            }
        }
    }






</details>


# 答案1
**得分**: 1

以下是您要翻译的内容:

使用您的`data`作为示例,由于您想要将两列转换为虚拟变量,我们还必须将`AGRUPA`的名称与它们的各个级别相统一。这样,您可以跟踪每个级别来自哪个`AGRUPA`变量。为此,我们首先需要对数据进行一些操作,将其转换为长格式,统一名称和级别,最后将统一的级别转换为保存在最终数据集的每一列中的虚拟变量:

```R
library(tidyverse)
# Reshape to long format (one row per DO / AGRUPA column / code), flag every
# observed combination with 1, then pivot the combinations back out into one
# 0/1 column named "<AGRUPA column>_<code>". Only codes that actually occur in
# the data get a column.
data_dummy <- data %>%
  pivot_longer(-DO, names_to = "AGRUPA", values_to = "val") %>%
  mutate(dummy = 1) %>%
  pivot_wider(
    names_from = c(AGRUPA, val),
    names_sep = "_",
    names_sort = TRUE, # alphabetical column order, as spread() produced
    values_from = dummy,
    values_fill = 0
  ) %>%
  arrange(DO) %>% # spread() returned the rows ordered by DO
  as.data.frame() # keep a plain data.frame, as gather()/spread() did

```

结果数据的一部分如下所示:

```
head(data_dummy) [1:5]
        DO AGRUPA2_A50_A64 AGRUPA2_B25_B34 AGRUPA2_J09_J18 AGRUPA2_O94_O99
1 15641334               0               0               0               0
2   205145               0               1               0               0
3   453475               0               1               0               0
4   456546               0               1               0               0
5   465466               0               1               0               0
6   484351               1               0               0               0
```

希望这对您有所帮助。

英文:

Taking your data as example, as you want two columns to convert to dummy variables, we must also unite the AGRUPA names to each of their levels. So you can track which level comes from which AGRUPA variable. To do this we first need some manipulation converting your data to long format, uniting names and levels, and finally converting united levels to dummy variables saved in each column in the final data set:

library(tidyverse)
# Reshape to long format (one row per DO / AGRUPA column / code), flag every
# observed combination with 1, then pivot the combinations back out into one
# 0/1 column named "<AGRUPA column>_<code>". Only codes that actually occur in
# the data get a column.
data_dummy <- data %>%
  pivot_longer(-DO, names_to = "AGRUPA", values_to = "val") %>%
  mutate(dummy = 1) %>%
  pivot_wider(
    names_from = c(AGRUPA, val),
    names_sep = "_",
    names_sort = TRUE, # alphabetical column order, as spread() produced
    values_from = dummy,
    values_fill = 0
  ) %>%
  arrange(DO) %>% # spread() returned the rows ordered by DO
  as.data.frame() # keep a plain data.frame, as gather()/spread() did

some part of resulting data

head(data_dummy) [1:5]
DO AGRUPA2_A50_A64 AGRUPA2_B25_B34 AGRUPA2_J09_J18 AGRUPA2_O94_O99
1 15641334               0               0               0               0
2   205145               0               1               0               0
3   453475               0               1               0               0
4   456546               0               1               0               0
5   465466               0               1               0               0
6   484351               1               0               0               0

答案2

得分: 1

The provided code appears to be written in R, and it seems to involve data manipulation using the tidyverse package. However, I can't provide a direct translation of the code as it contains specific R functions and variable names. If you have any questions or need assistance with understanding or modifying the code, please feel free to ask.

英文:
library(tidyverse)
# Long format first (pivot_longer() names the new columns "name" and "value"),
# then flag the observed rows with 1. complete() adds a row for every code in
# agrucid10 for each DO / column pair and fills its flag with 0, so the wide
# result has a dummy column for every column/code combination, observed or not.
data %>%
  pivot_longer(-DO) %>%
  mutate(flag = 1) %>%
  complete(DO, name, value = agrucid10, fill = list(flag = 0)) %>%
  pivot_wider(names_from = c(name, value), values_from = flag)

Result

# A tibble: 10 × 455
DO       AGRUPA2…¹ AGRUP…² AGRUP…³ AGRUP…⁴ AGRUP…⁵ AGRUP…⁶ AGRUP…⁷ AGRUP…⁸ AGRUP…⁹ AGRUP…˟ AGRUP…˟ AGRUP…˟ AGRUP…˟ AGRUP…˟
<chr>        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 15641334         0       0       0       0       0       0       0       0       0       0       0       0       0       0
2 205145           0       0       0       0       0       0       0       0       0       0       0       0       0       1
3 453475           0       0       0       0       0       0       0       0       0       0       0       0       0       1
4 456546           0       0       0       0       0       0       0       0       0       0       0       0       0       1
5 465466           0       0       0       0       0       0       0       0       0       0       0       0       0       1
6 484351           0       0       0       0       1       0       0       0       0       0       0       0       0       0
7 564635           0       0       0       0       0       0       0       0       0       0       0       0       0       0
8 587523           0       0       0       0       0       0       0       0       0       0       0       0       0       0
9 65255            0       0       0       0       0       0       0       0       0       0       0       0       0       0
10 674646           0       0       0       0       0       0       0       0       0       0       0       0       0       0
# … with 440 more variables: AGRUPA2_B35_B49 <dbl>, AGRUPA2_B50_B64 <dbl>, AGRUPA2_B65_B83 <dbl>, AGRUPA2_B85_B89 <dbl>,
#   AGRUPA2_B90_B94 <dbl>, AGRUPA2_B95_B97 <dbl>, AGRUPA2_B99 <dbl>, AGRUPA2_C00_C14 <dbl>, AGRUPA2_C15_C26 <dbl>,
#   AGRUPA2_C30_C39 <dbl>, AGRUPA2_C40_C41 <dbl>, AGRUPA2_C43_C44 <dbl>, AGRUPA2_C45_C49 <dbl>, AGRUPA2_C50 <dbl>,
#   AGRUPA2_C51_C58 <dbl>, AGRUPA2_C60_C63 <dbl>, AGRUPA2_C64_C68 <dbl>, AGRUPA2_C69_C72 <dbl>, AGRUPA2_C73_C75 <dbl>,
#   AGRUPA2_C76_C80 <dbl>, AGRUPA2_C81_C96 <dbl>, AGRUPA2_C97 <dbl>, AGRUPA2_D00_D09 <dbl>, AGRUPA2_D10_D36 <dbl>,
#   AGRUPA2_D37_D48 <dbl>, AGRUPA2_D50_D53 <dbl>, AGRUPA2_D55_D59 <dbl>, AGRUPA2_D60_D64 <dbl>, AGRUPA2_D65_D69 <dbl>,
#   AGRUPA2_D70_D77 <dbl>, AGRUPA2_D80_D89 <dbl>, AGRUPA2_E00_E07 <dbl>, AGRUPA2_E10_E14 <dbl>, AGRUPA2_E15_E16 <dbl>, …
# ℹ Use `colnames()` to see all variable names

huangapple
  • 本文由 发表于 2023年5月7日 00:29:40
  • 转载请务必保留本文链接:https://go.coder-hub.com/76189986.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定