问题

我正在尝试将此 XML 转换为 R 中的数据框：

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml

library(xml2)
library(tidyverse)

fileurl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml"
xmllist <- as_list(read_xml(fileurl)) 
xml_df = tibble::as_tibble(xmllist) %>%
        unnest_longer(response)

row_wider = xml_df %>%
        unnest_wider(response)

row_df = row_wider %>%
        unnest(cols = names(.)) %>%
        unnest(cols = names(.)) %>%
        readr::type_convert()

问题在于 'location_1' 列是一个字典，在我展开时会显示为 NA。如何将此字典的每个值放入此列？非常感谢您的帮助。

英文:

I am attempting to convert this xml to a dataframe in R:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml

library(xml2)
library(tidyverse)

fileurl &lt;- &quot;https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml&quot;
xmllist &lt;- as_list(read_xml(fileurl)) 
xml_df = tibble::as_tibble(xmllist) %&gt;%
        unnest_longer(response)

row_wider = xml_df %&gt;%
        unnest_wider(response)

row_df = row_wider %&gt;%
        unnest(cols = names(.)) %&gt;%
        unnest(cols = names(.)) %&gt;%
        readr::type_convert()

The issue is that the 'location_1' column is a dictionary and shows up as NA when I unnest. How can I get each of the values of this dictionary into this column? Any help is much appreciated, thanks.

答案1

得分: 3

以下是您要翻译的内容：

"所请求的地址数据以JSON格式存储在XML节点的属性中。接下来我会提取这些属性，将JSON数据转换并合并。然后，生成的数据框可以绑定到之前执行的工作中。

有关详细信息，请参阅注释。

library(xml2)
library(jsonlite)
library(tidyverse)

# 将文件读取为XML
page <- read_xml(fileurl)

# 提取餐厅节点到一个向量中
restaurants <- page %>% xml_find_all(".//row/row")
# 获取存储为属性数据的地址数据
addresses <- restaurants %>% xml_find_first(".//location_1") %>% xml_attr("human_address")

# 这是一个JSON数据结构的向量
# 将JSON转换为数据框

dfs <- lapply(addresses, function(address){
   address %>% fromJSON() %>% as.data.frame()
})
# 合并所有数据框
answer <- bind_rows(dfs)

answer

                     address      city state zip
1           4509 BELAIR ROAD Baltimore    MD    
2              1919 FLEET ST Baltimore    MD    
3             2844 HUDSON ST Baltimore    MD    
4            3998 ROLAND AVE Baltimore    MD    
5         2481 frederick ave Baltimore    MD    
6            2722 HARFORD RD Baltimore    MD    
```"

<details>
<summary>英文:</summary>

The requested address data is stored as JSON in the XML node&#39;s attribute.   
Below I extract the attribute, convert the JSON and then merge.  The resulting dataframe can then be binded to the work performed did above.  

See comments for details.   

```r
library(xml2)
library(jsonlite)
library(tidyverse)

#read file as xml
page &lt;- read_xml(fileurl)

#extract out the restaurant nodes into a vector
restaurants &lt;- page %&gt;% xml_find_all(&quot;.//row/row&quot;)
#get the address data which is stored as attribute data
addresses &lt;- restaurants %&gt;% xml_find_first(&quot;.//location_1&quot;) %&gt;% xml_attr(&quot;human_address&quot;) 

#this is a vector of JSON data structures
#convert the JSON to a data frame

dfs &lt;- lapply(addresses, function(address){
   address %&gt;% fromJSON() %&gt;% as.data.frame()
})
#combine all of the data frames
answer&lt;- bind_rows(dfs)

answer

                     address      city state zip
1           4509 BELAIR ROAD Baltimore    MD    
2              1919 FLEET ST Baltimore    MD    
3             2844 HUDSON ST Baltimore    MD    
4            3998 ROLAND AVE Baltimore    MD    
5         2481 frederick ave Baltimore    MD    
6            2722 HARFORD RD Baltimore    MD

答案2

得分: 2

location_1列是一个空列表（因此你会得到NA值），具有两个属性：human_address，它是一个JSON字符串，和一个逻辑值needs_recoding。获得你想要的结果的一个选项是首先提取这些属性的内容并将它们存储在一个list中。然后，你可以使用两个unnest_wider来展开这个列表列。

library(xml2)
library(tidyverse)

parse_location_1 <- function(x) {
  x$location_1 <- list(
    human_address = jsonlite::fromJSON(attr(x$location_1, "human_address")),
    needs_recoding = attr(x$location_1, "needs_recoding")
  )
  x
}

fileurl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml"
xmllist <- as_list(read_xml(fileurl))
xml_df <- tibble::as_tibble(xmllist) %>%
  unnest_longer(response) %>|
  mutate(response = map(
    response, parse_location_1
  ))

row_wider <- xml_df %>%
  unnest_wider(response) %>|
  unnest_wider(location_1) %>| 
  unnest_wider(human_address)

row_df <- row_wider %>%
  unnest(cols = where(is.list)) %>%
  unnest(cols = where(is.list)) %>%
  readr::type_convert()

希望这对你有所帮助。

英文:

The location_1 column is an empty list (hence you get NAs) with two attributes human_address which is a JSON string and a logical needs_recoding. One option to get your desired result would be to first extract the content of these attributes and store them in a list. Afterwards you could use two unnest_wider to unnest the list column.

library(xml2)
library(tidyverse)

parse_location_1 &lt;- function(x) {
  x$location_1 &lt;- list(
    human_address = jsonlite::fromJSON(attr(x$location_1, &quot;human_address&quot;)),
    needs_recoding = attr(x$location_1, &quot;needs_recoding&quot;)
  )
  x
}

fileurl &lt;- &quot;https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml&quot;
xmllist &lt;- as_list(read_xml(fileurl))
xml_df &lt;- tibble::as_tibble(xmllist) %&gt;%
  unnest_longer(response) |&gt;
  mutate(response = map(
    response, parse_location_1
  ))

row_wider &lt;- xml_df %&gt;%
  unnest_wider(response) |&gt;
  unnest_wider(location_1) |&gt; 
  unnest_wider(human_address)

row_df &lt;- row_wider %&gt;%
  unnest(cols = where(is.list)) %&gt;%
  unnest(cols = where(is.list)) %&gt;%
  readr::type_convert()
#&gt; 
#&gt; ── Column specification ────────────────────────────────────────────────────────
#&gt; cols(
#&gt;   name = col_character(),
#&gt;   zipcode = col_double(),
#&gt;   neighborhood = col_character(),
#&gt;   councildistrict = col_double(),
#&gt;   policedistrict = col_character(),
#&gt;   address = col_character(),
#&gt;   city = col_character(),
#&gt;   state = col_character(),
#&gt;   zip = col_logical(),
#&gt;   needs_recoding = col_logical(),
#&gt;   response_id = col_character()
#&gt; )

head(row_df)
#&gt; # A tibble: 6 &#215; 11
#&gt;   name   zipcode neighborhood councildistrict policedistrict address city  state
#&gt;   &lt;chr&gt;    &lt;dbl&gt; &lt;chr&gt;                  &lt;dbl&gt; &lt;chr&gt;          &lt;chr&gt;   &lt;chr&gt; &lt;chr&gt;
#&gt; 1 410      21206 Frankford                  2 NORTHEASTERN   4509 B… Balt… MD   
#&gt; 2 1919     21231 Fells Point                1 SOUTHEASTERN   1919 F… Balt… MD   
#&gt; 3 SAUTE    21224 Canton                     1 SOUTHEASTERN   2844 H… Balt… MD   
#&gt; 4 #1 CH…   21211 Hampden                   14 NORTHERN       3998 R… Balt… MD   
#&gt; 5 #1 ch…   21223 Millhill                   9 SOUTHWESTERN   2481 f… Balt… MD   
#&gt; 6 19TH …   21218 Clifton Park              14 NORTHEASTERN   2722 H… Balt… MD   
#&gt; # ℹ 3 more variables: zip &lt;lgl&gt;, needs_recoding &lt;lgl&gt;, response_id &lt;chr&gt;

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

如何在R中将XML中的字典解嵌套？

问题

答案1

答案2

如何在XPath和Python中使用preceding-sibling？它似乎显示错误的输出。

使用Plotly创建虚线网格线。

如何从SQLite数据库获取数据并在EditText中显示它

`tidyr::crossing()` 使用一个数据框列表意外地减少了行数。

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论