尝试从网页抓取数据创建数据框。

huangapple go评论64阅读模式
英文:

Try to make dataframe from web scraping

问题

I am trying to make a data frame from the data at this URL, https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day, in RStudio.

# Start from a clean workspace; all.names = TRUE also removes objects whose
# names begin with a dot.
rm(list = ls(all.names = TRUE))

library(jsonlite)
library(dplyr)

# Query-string parameters are separated by a plain "&". The "&amp;" form is an
# HTML escape and must not appear in the URL handed to readLines().
url <- "https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day"

# Read the raw response as a character vector, one element per line.
data <- readLines(url, warn = FALSE)

print(data)

# Strip tab characters from every line.
data_t <- gsub("\t", "", data)

print(data_t)

As a result, the data is printed like this:

[1] ""
  [2] " [['날짜', '시가', '고가', '저가', '종가', '거래량', '외국인소진율'],"
  [3] ""
  [4] ""
  [5] ""
  [6] ""
  [7] "[\"20220314\", 70000, 70200, 69600, 70200, 9040993, 51.84],"
  [8] ""
  [9] "[\"20220315\", 69800, 70100, 69500, 69500, 10258562, 51.8],"
 [10] ""
 [11] "[\"20220316\", 70200, 70500, 69700, 70400, 10175750, 51.79],"
 [12] ""
 [13] "[\"20220317\", 71200, 71800, 70900, 71200, 17646315, 51.79],"
 [14] ""
 [15] "[\"20220318\", 70600, 70900, 70200, 70700, 14410038, 51.79],"

How do I pick out the data rows and combine them correctly?

How should I write the code to complete this?

Please provide code that builds a data frame from the URL.

英文:

I am trying to make a data frame from the data at this URL, https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day, in RStudio.


# Start from a clean workspace; all.names = TRUE also removes objects whose
# names begin with a dot.
rm(list = ls(all.names = TRUE))

library(jsonlite)
library(dplyr)

# Query-string parameters are separated by a plain "&". The "&amp;" form is an
# HTML escape and must not appear in the URL handed to readLines().
url <- "https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day"

# Read the raw response as a character vector, one element per line.
data <- readLines(url, warn = FALSE)

print(data)

# Strip tab characters from every line.
data_t <- gsub("\t", "", data)

print(data_t)

As a result, the data is printed like this:

[1] ""
  [2] " [['날짜', '시가', '고가', '저가', '종가', '거래량', '외국인소진율'],"
  [3] ""
  [4] ""
  [5] ""
  [6] ""
  [7] "[\"20220314\", 70000, 70200, 69600, 70200, 9040993, 51.84],"
  [8] ""
  [9] "[\"20220315\", 69800, 70100, 69500, 69500, 10258562, 51.8],"
 [10] ""
 [11] "[\"20220316\", 70200, 70500, 69700, 70400, 10175750, 51.79],"
 [12] ""
 [13] "[\"20220317\", 71200, 71800, 70900, 71200, 17646315, 51.79],"
 [14] ""
 [15] "[\"20220318\", 70600, 70900, 70200, 70700, 14410038, 51.79],"

How do I pick out the data rows and combine them correctly?

How should I write the code to complete this?

Please provide code that builds a data frame from the URL.

答案1

得分: 1

先加载所需的包并读取原始数据:

# dplyr supplies bind_rows() and mutate(); stringr supplies str_remove_all().
library(dplyr)
library(stringr)

# Endpoint whose raw response text is parsed in the following steps.
url <- "https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day"

# Fetch the response as a character vector with one element per line.
data <- readLines(con = url, warn = FALSE)

当我们移除制表符\t时,同时也可以去掉括号,而 strsplit() 允许我们将行拆分为向量,其中每个未来的数据框列都是单独的条目。

# Remove tabs and square brackets from every line, then split each line on
# ", " so that every field becomes its own vector element.
# NOTE: a "," at the very end of a line is not removed by this pattern, so the
# last field of each line keeps it.
data2 <-
  gsub("\t|\\[|\\]", "", data) |>
  strsplit(", ")

让我们从第二行获取列名并从数据对象中删除该行。

# The second element holds the header fields: trim surrounding whitespace and
# drop the single quotes to obtain the column names.
column_names <-
  data2[[2]] |>
  trimws() |>
  str_remove_all("'")

# Remove the header element so that only the remaining lines are left.
data3 <-
  data2[-2]

删除空行。

# Keep only the elements that contain at least one field.
data4 <- Filter(function(fields) length(fields) != 0, data3)

为每个数据向量添加列名,并使用 dplyr::bind_rows() 绑定到数据帧中。

# Name the fields of every row vector, stack the rows into one tibble, and
# strip the literal double quotes that surround each date value.
lapply(data4,
       setNames,
       column_names) |>
  bind_rows() |>
  mutate(날짜 = str_remove_all(날짜, "\""))

#> # A tibble: 294 × 7
#>    날짜     시가  고가  저가  종가  거래량   `외국인소진율,`
#>    <chr>    <chr> <chr> <chr> <chr> <chr>    <chr>          
#>  1 20220314 70000 70200 69600 70200 9040993  51.84,         
#>  2 20220315 69800 70100 69500 69500 10258562 51.8,          
#>  3 20220316 70200 70500 69700 70400 10175750 51.79,         
#>  4 20220317 71200 71800 70900 71200 17646315 51.79,         
#>  5 20220318 70600 70900 70200 70700 14410038 51.79,         
#>  6 20220321 70900 71000 69900 69900 11169002 51.75,         
#>  7 20220322 69900 70500 69900 70300 9402666  51.75,         
#>  8 20220323 70600 71200 70300 70500 12398025 51.74,         
#>  9 20220324 69600 70300 69600 69800 37943357 51.97,         
#> 10 20220325 70100 70200 69600 69800 12986010 51.91,         
#> # ℹ 284 more rows
英文:
# dplyr supplies bind_rows() and mutate(); stringr supplies str_remove_all().
library(dplyr)
library(stringr)

# Endpoint whose raw response text is parsed in the following steps.
url <- "https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day"

# Fetch the response as a character vector with one element per line.
data <- readLines(url, warn = FALSE)

When we remove the tabs \t we can get rid of the brackets at the same time
and strsplit() allows us to break up the lines in to vectors where each
future dataframe column is a separate entry.

# Remove tabs and square brackets from every line, then split each line on
# ", " so that every field becomes its own vector element.
# NOTE: a "," at the very end of a line is not removed by this pattern, so the
# last field of each line keeps it.
data2 <-
  gsub("\t|\\[|\\]", "", data) |>
  strsplit(", ")

Let’s get the column names from the second line and drop the line from the
data object

# The second element holds the header fields: trim surrounding whitespace and
# drop the single quotes to obtain the column names.
column_names <-
  data2[[2]] |>
  trimws() |>
  str_remove_all("'")

# Remove the header element so that only the remaining lines are left.
data3 <-
  data2[-2]

Drop empty rows.

# Keep only the elements that contain at least one field.
data4 <-
  data3[lengths(data3) != 0]

Add column names to each data vector and bind it into a data.frame with
dplyr::bind_rows()

# Name the fields of every row vector, stack the rows into one tibble, and
# strip the literal double quotes that surround each date value.
lapply(data4,
       setNames,
       column_names) |>
  bind_rows() |>
  mutate(날짜 = str_remove_all(날짜, "\\\""))
#> # A tibble: 294 × 7
#>    날짜     시가  고가  저가  종가  거래량   `외국인소진율,`
#>    <chr>    <chr> <chr> <chr> <chr> <chr>    <chr>          
#>  1 20220314 70000 70200 69600 70200 9040993  51.84,         
#>  2 20220315 69800 70100 69500 69500 10258562 51.8,          
#>  3 20220316 70200 70500 69700 70400 10175750 51.79,         
#>  4 20220317 71200 71800 70900 71200 17646315 51.79,         
#>  5 20220318 70600 70900 70200 70700 14410038 51.79,         
#>  6 20220321 70900 71000 69900 69900 11169002 51.75,         
#>  7 20220322 69900 70500 69900 70300 9402666  51.75,         
#>  8 20220323 70600 71200 70300 70500 12398025 51.74,         
#>  9 20220324 69600 70300 69600 69800 37943357 51.97,         
#> 10 20220325 70100 70200 69600 69800 12986010 51.91,         
#> # ℹ 284 more rows

答案2

得分: 1

library(dplyr)

# Read the response as CSV with explicit column names, convert every character
# column to numeric with parse_number(), then drop the last column and the
# first row.
# NOTE(review): presumably the dropped column is the extra one produced by each
# line's trailing comma and the dropped row is the header line; confirm against
# the raw response.
"https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day" |>
  readr::read_csv(col_names = c(
    '날짜', '시가', '고가', '저가', '종가', '거래량', '외국인소진율'
  )) |>
  mutate(across(where(is.character), readr::parse_number)) |>
  select(-last_col()) |>
  slice(2:n())

Results:

# A tibble: 295 × 7
       날짜    시가    고가    저가    종가     거래량 외국인소진율
    <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>        <dbl>
 1 20220314 70000 70200 69600 70200  9040993         51.8
 2 20220315 69800 70100 69500 69500 10258562         51.8
 3 20220316 70200 70500 69700 70400 10175750         51.8
 4 20220317 71200 71800 70900 71200 17646315         51.8
 5 20220318 70600 70900 70200 70700 14410038         51.8
 6 20220321 70900 71000 69900 69900 11169002         51.8
 7 20220322 69900 70500 69900 70300  9402666         51.8
 8 20220323 70600 71200 70300 70500 12398025         51.7
 9 20220324 69600 70300 69600 69800 37943357         52.0
10 20220325 70100 70200 69600 69800 12986010         51.9
英文:
library(dplyr)

# Read the response as CSV with explicit column names, convert every character
# column to numeric with parse_number(), then drop the last column and the
# first row.
# NOTE(review): presumably the dropped column is the extra one produced by each
# line's trailing comma and the dropped row is the header line; confirm against
# the raw response.
"https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day" |>
  readr::read_csv(col_names = c(
    '날짜', '시가', '고가', '저가', '종가', '거래량', '외국인소진율'
  )) |>
  mutate(across(where(is.character), readr::parse_number)) |>
  select(-last_col()) |>
  slice(2:n())

Results:

# A tibble: 295 × 7
       날짜  시가  고가  저가  종가   거래량 외국인소진율
      <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>        <dbl>
 1 20220314 70000 70200 69600 70200  9040993         51.8
 2 20220315 69800 70100 69500 69500 10258562         51.8
 3 20220316 70200 70500 69700 70400 10175750         51.8
 4 20220317 71200 71800 70900 71200 17646315         51.8
 5 20220318 70600 70900 70200 70700 14410038         51.8
 6 20220321 70900 71000 69900 69900 11169002         51.8
 7 20220322 69900 70500 69900 70300  9402666         51.8
 8 20220323 70600 71200 70300 70500 12398025         51.7
 9 20220324 69600 70300 69600 69800 37943357         52.0
10 20220325 70100 70200 69600 69800 12986010         51.9

huangapple
  • 本文由 发表于 2023年5月17日 20:03:09
  • 转载请务必保留本文链接:https://go.coder-hub.com/76271886.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定