英文:
Truncating numbers, but the results do not have the right number of characters
问题
我正在尝试提取 `hs10` 变量的前6个数字,但对于某些情况,我只得到了5个字符 - 这是否有原因?
我已经使用了这个函数:
# Format hs10 without scientific notation, keep characters 1 to 6 of each
# formatted string, and store the result as a numeric HS6 column.
us_chn_tariffs_18[["HS6"]] <- as.numeric(
  substr(format(us_chn_tariffs_18[["hs10"]], scientific = FALSE), 1, 6)
)
`dput` 的输出如下:
structure(list(hs10 = structure(c(208100000, 208902500, 301110010,
301110020, 301110090, 301990390, 302230000, 302290110, 302290190,
302420000, 302455000, 302595010, 302595090, 302740000, 302845000,
302895077, 302912000, 303120022, 303120032, 303230000), label = "HS10 Product Code", format.stata = "%10.0f"),
tariff_max = structure(c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1), label = "US Import Tariff Increase (max)", format.stata = "%9.3f"),
tariff_scaled = structure(c(0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021
), label = "US Import Tariff Increase (scaled)", format.stata = "%9.3f"),
effective_mdate = structure(c(704, 704, 704, 704, 704, 704,
704, 704, 704, 704, 704, 704, 704, 704, 704, 704, 704, 704,
704, 704), label = "Month Variety First Targeted", format.stata = "%tm"),
month = c("9", "9", "9", "9", "9", "9", "9", "9", "9", "9",
"9", "9", "9", "9", "9", "9", "9", "9", "9", "9"), treated = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
HS6 = structure(c(20810, 20890, 30111, 30111, 30111, 30199,
30223, 30229, 30229, 30242, 30245, 30259, 30259, 30274, 30284,
30289, 30291, 30312, 30312, 30323), label = "HS10 Product Code", format.stata = "%10.0f")), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
谢谢
英文:
I am trying to extract the first 6 digits of the `hs10` variable, but for some values I am getting only 5 characters - is there a reason for this?
I have used this function
# Format hs10 without scientific notation, keep characters 1 to 6 of each
# formatted string, and store the result as a numeric HS6 column.
us_chn_tariffs_18[["HS6"]] <- as.numeric(
  substr(format(us_chn_tariffs_18[["hs10"]], scientific = FALSE), 1, 6)
)
The `dput` output is:
structure(list(hs10 = structure(c(208100000, 208902500, 301110010,
301110020, 301110090, 301990390, 302230000, 302290110, 302290190,
302420000, 302455000, 302595010, 302595090, 302740000, 302845000,
302895077, 302912000, 303120022, 303120032, 303230000), label = "HS10 Product Code", format.stata = "%10.0f"),
tariff_max = structure(c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1), label = "US Import Tariff Increase (max)", format.stata = "%9.3f"),
tariff_scaled = structure(c(0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021,
0.0333333350718021, 0.0333333350718021, 0.0333333350718021
), label = "US Import Tariff Increase (scaled)", format.stata = "%9.3f"),
effective_mdate = structure(c(704, 704, 704, 704, 704, 704,
704, 704, 704, 704, 704, 704, 704, 704, 704, 704, 704, 704,
704, 704), label = "Month Variety First Targeted", format.stata = "%tm"),
month = c("9", "9", "9", "9", "9", "9", "9", "9", "9", "9",
"9", "9", "9", "9", "9", "9", "9", "9", "9", "9"), treated = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
HS6 = structure(c(20810, 20890, 30111, 30111, 30111, 30199,
30223, 30229, 30229, 30242, 30245, 30259, 30259, 30274, 30284,
30289, 30291, 30312, 30312, 30323), label = "HS10 Product Code", format.stata = "%10.0f")), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
Thank you
答案1
得分: 0
I think I can see where the problem comes in, but cannot reproduce it exactly (as I've not imported from Stata). The Stata print format is `%10.0f`, which prints 10 characters, padding your digits with leading spaces. But your IDs are all 9 digits long, so the first character pulled by `substr` is a space, which `as.numeric` trims. Try reformatting `hs10`:
# HS10 product codes copied from the question; each value has 9 digits.
hs10 <- c(
  208100000, 208902500, 301110010, 301110020, 301110090,
  301990390, 302230000, 302290110, 302290190, 302420000,
  302455000, 302595010, 302595090, 302740000, 302845000,
  302895077, 302912000, 303120022, 303120032, 303230000
)

# Reproducing the values you're getting: "%10.0f" right-aligns every code in a
# 10-character field, so a 9-digit code starts with one space. substr(, 1, 6)
# then returns that space plus five digits, and as.numeric() ignores the space.
hs10_not_working <- sprintf(fmt = "%10.0f", hs10)
as.numeric(substr(format(hs10_not_working, scientific = FALSE), 1, 6))
#> [1] 20810 20890 30111 30111 30111 30199 30223 30229 30229 30242 30245 30259
#> [13] 30259 30274 30284 30289 30291 30312 30312 30323

# Corrected values: the "-" flag left-aligns the code inside the field, moving
# the padding to the end so characters 1 to 6 are all digits.
hs10_edited <- sprintf(fmt = "%-10.0f", hs10)
as.numeric(substr(format(hs10_edited, scientific = FALSE), 1, 6))
#> [1] 208100 208902 301110 301110 301110 301990 302230 302290 302290 302420
#> [11] 302455 302595 302595 302740 302845 302895 302912 303120 303120 303230
The new `sprintf(fmt = "%-10.0f", hs10)` part left-aligns the numbers (that is what the `-` flag does), which means the first 6 digits are extracted even if the value is only 9 digits long.
Another simple solution would be to apply `trimws` before `substr`ing your variable:
# Remove leading/trailing whitespace from the formatted codes before taking
# characters 1 to 6, so padding cannot shift the digits that are extracted.
# NOTE(review): if hs10 is meant to be a 10-digit code whose leading zero was
# dropped on numeric import, zero-pad to 10 digits instead of trimming - confirm.
us_chn_tariffs_18$HS6 <- as.numeric(
  substr(trimws(format(us_chn_tariffs_18$hs10, scientific = FALSE)), 1, 6)
)
英文:
I think I can see where the problem comes in, but cannot reproduce it exactly (as I've not imported from Stata). The Stata print format is `%10.0f`, which prints 10 characters, padding your digits with leading spaces. But your IDs are all 9 digits long, so the first character pulled by `substr` is a space, which `as.numeric` trims. Try reformatting `hs10`:
# HS10 product codes copied from the question; each value has 9 digits.
hs10 <- c(
  208100000, 208902500, 301110010, 301110020, 301110090,
  301990390, 302230000, 302290110, 302290190, 302420000,
  302455000, 302595010, 302595090, 302740000, 302845000,
  302895077, 302912000, 303120022, 303120032, 303230000
)

# Reproducing the values you're getting: "%10.0f" right-aligns every code in a
# 10-character field, so a 9-digit code starts with one space. substr(, 1, 6)
# then returns that space plus five digits, and as.numeric() ignores the space.
hs10_not_working <- sprintf(fmt = "%10.0f", hs10)
as.numeric(substr(format(hs10_not_working, scientific = FALSE), 1, 6))
#> [1] 20810 20890 30111 30111 30111 30199 30223 30229 30229 30242 30245 30259
#> [13] 30259 30274 30284 30289 30291 30312 30312 30323

# Corrected values: the "-" flag left-aligns the code inside the field, moving
# the padding to the end so characters 1 to 6 are all digits.
hs10_edited <- sprintf(fmt = "%-10.0f", hs10)
as.numeric(substr(format(hs10_edited, scientific = FALSE), 1, 6))
#> [1] 208100 208902 301110 301110 301110 301990 302230 302290 302290 302420
#> [11] 302455 302595 302595 302740 302845 302895 302912 303120 303120 303230
The new `sprintf(fmt = "%-10.0f", hs10)` part left-aligns the numbers (that is what the `-` flag does), which means the first 6 digits are extracted even if the value is only 9 digits long.
Another simple solution would be to apply `trimws` before `substr`ing your variable:
# Remove leading/trailing whitespace from the formatted codes before taking
# characters 1 to 6, so padding cannot shift the digits that are extracted.
# NOTE(review): if hs10 is meant to be a 10-digit code whose leading zero was
# dropped on numeric import, zero-pad to 10 digits instead of trimming - confirm.
us_chn_tariffs_18$HS6 <- as.numeric(
  substr(trimws(format(us_chn_tariffs_18$hs10, scientific = FALSE)), 1, 6)
)
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论