英文:
R Programming - Data Cleaning - DateTime
问题
你好,Stackoverflow社区,
我目前正在处理一个具有日期/时间变量和用于量化某种强度体育活动时间的数值变量的大型数据集。数据集的形式如下:
data_raw <- structure(list(`Bout Start` = c("2/8/2017 9:01:00 AM", "2/8/2017 9:23:00 AM", "2/8/2017 9:42:00 AM", "2/8/2017 11:49:00 AM", "2/8/2017 1:39:00 PM"), `Bout End` = c("2/8/2017 9:12:00 AM", "2/8/2017 9:38:00 AM", "2/8/2017 9:52:00 AM", "2/8/2017 12:05:00 PM", "2/8/2017 1:58:00 PM"), `Time in Bout` = c(11, 15, 10, 16, 19)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))
我需要将我的数据集处理成以下形式:
data_processed <- structure(list(Date = structure(c(Date5306 = 17205, Date5307 = 17205, Date5308 = 17205, Date5309 = 17205, Date5310 = 17205), class = "Date"), Hour = structure(c(28800, 32400, 36000, 39600, 43200), class = c("hms", "difftime"), units = "secs"), `Time in Bout (Hourly)` = c(0, 36, 0, 11, 5)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))
请问有人能帮助我完成这个转换吗?非常感谢大家的帮助!
英文:
Hello Stackoverflow community,
I am currently working with a large dataset that has Date/Time variable and a numeric variable that quantifies the time spent in physical activity of a certain intensity. The dataset is in the form:
data_raw <- structure(list(`Bout Start` = c("2/8/2017 9:01:00 AM", "2/8/2017 9:23:00 AM", "2/8/2017 9:42:00 AM", "2/8/2017 11:49:00 AM", "2/8/2017 1:39:00 PM"), `Bout End` = c("2/8/2017 9:12:00 AM", "2/8/2017 9:38:00 AM", "2/8/2017 9:52:00 AM", "2/8/2017 12:05:00 PM", "2/8/2017 1:58:00 PM"),`Time in Bout` = c(11, 15, 10, 16, 19)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))
I require my dataset in the form:
data_processed <- structure(list(Date = structure(c(Date5306 = 17205, Date5307 = 17205, Date5308 = 17205, Date5309 = 17205, Date5310 = 17205), class = "Date"), Hour = structure(c(28800, 32400, 36000, 39600, 43200), class = c("hms", "difftime"), units = "secs"), `Time in Bout (Hourly)` = c(0, 36, 0, 11, 5)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))
Could someone please help me do this? Thank you all in advance!
答案1
得分: 1
首先,我们需要将data_raw
中的日期时间字符串转换为实际的日期时间变量:
data <- within(data_raw, {
`Bout Start` <- as.POSIXct(`Bout Start`, format = "%m/%d/%Y %I:%M:%S %p")
`Bout End` <- as.POSIXct(`Bout End`, format = "%m/%d/%Y %I:%M:%S %p")
})
现在您的数据看起来像这样:
data
#> # A tibble: 5 x 3
#> `Bout Start` `Bout End` `Time in Bout`
#> <dttm> <dttm> <dbl>
#> 1 2017-02-08 09:01:00 2017-02-08 09:12:00 11
#> 2 2017-02-08 09:23:00 2017-02-08 09:38:00 15
#> 3 2017-02-08 09:42:00 2017-02-08 09:52:00 10
#> 4 2017-02-08 11:49:00 2017-02-08 12:05:00 16
#> 5 2017-02-08 13:39:00 2017-02-08 13:58:00 19
现在我们需要创建一个小时向量,用于检查是否有发作事件发生的小时:
times <- seq(as.POSIXct("2017-02-08 08:00"), by = "hour", len = 7)
现在棘手的部分是计算每个小时内发作事件的分钟数:
mins <- rowSums(sapply(seq(nrow(data)), function(i) {
a <- data$`Bout End`[i] - times
a <- ifelse(a > 0 & a < 60, a, 0)
b <- data$`Bout Start`[i] - times
b <- ifelse(b > 0 & b < 60, b, 0)
(a - b) %% 60
}))
最后,我们创建一个结果的数据框:
data.frame(Date = as.Date(head(times, -1)),
Hour = strftime(head(times, -1), "%H:%M:%S"),
`Time in bout` = head(mins, -1), check.names = FALSE)
#> Date Hour Time in bout
#> 1 2017-02-08 08:00:00 0
#> 2 2017-02-08 09:00:00 36
#> 3 2017-02-08 10:00:00 0
#> 4 2017-02-08 11:00:00 11
#> 5 2017-02-08 12:00:00 5
#> 6 2017-02-08 13:00:00 19
创建于2023-02-15,使用 reprex v2.0.2
英文:
First of all, we need to convert your date-time strings in data_raw
to actual date-time variables:
data <- within(data_raw, {
`Bout Start` <- as.POSIXct(`Bout Start`, format = "%m/%d/%Y %I:%M:%S %p")
`Bout End` <- as.POSIXct(`Bout End`, format = "%m/%d/%Y %I:%M:%S %p")
})
Now your data looks like this:
data
#> # A tibble: 5 x 3
#> `Bout Start` `Bout End` `Time in Bout`
#> <dttm> <dttm> <dbl>
#> 1 2017-02-08 09:01:00 2017-02-08 09:12:00 11
#> 2 2017-02-08 09:23:00 2017-02-08 09:38:00 15
#> 3 2017-02-08 09:42:00 2017-02-08 09:52:00 10
#> 4 2017-02-08 11:49:00 2017-02-08 12:05:00 16
#> 5 2017-02-08 13:39:00 2017-02-08 13:58:00 19
We now need to create a vector of hours at which you want to check for bouts:
times <- seq(as.POSIXct("2017-02-08 08:00"), by = "hour", len = 7)
The tricky part is now just counting the minutes within each of these hours when there was a bout taking place:
mins <- rowSums(sapply(seq(nrow(data)), function(i) {
a <- data$`Bout End`[i] - times
a <- ifelse(a > 0 & a < 60, a, 0)
b <- data$`Bout Start`[i] - times
b <- ifelse(b > 0 & b < 60, b, 0)
(a - b) %% 60
}))
Finally, we create a data frame of the results:
data.frame(Date = as.Date(head(times, -1)),
Hour = strftime(head(times, -1), "%H:%M:%S"),
`Time in bout` = head(mins, -1), check.names = FALSE)
#> Date Hour Time in bout
#> 1 2017-02-08 08:00:00 0
#> 2 2017-02-08 09:00:00 36
#> 3 2017-02-08 10:00:00 0
#> 4 2017-02-08 11:00:00 11
#> 5 2017-02-08 12:00:00 5
#> 6 2017-02-08 13:00:00 19
<sup>Created on 2023-02-15 with reprex v2.0.2</sup>
答案2
得分: 0
以下是翻译好的内容:
- 获取日期/小时序列以填充缺失数据的
dd1
- 将跨越小时的时间拆分为相应的小时段,
dd2
- 连接
dd1
和dd2
- 同时,将字符串动态转换为日期和完整的小时
library(dplyr) # >= v1.1.0 for ".by" in full_join's summarize and consecutive_id
library(tidyr) # separate and replace_na
library(lubridate) # date functions
dd1 <- tibble(ID = seq(
ymd_hms(format(first(mdy_hms(data_raw$`Bout Start`)),
"%Y-%m-%d %H:00:00")),
ymd_hms(format(last(mdy_hms(data_raw$`Bout Start`)),
"%Y-%m-%d %H:00:00")), 3600))
dd1
# A tibble: 5 × 1
ID
<dttm>
1 2017-02-08 09:00:00
2 2017-02-08 10:00:00
3 2017-02-08 11:00:00
4 2017-02-08 12:00:00
5 2017-02-08 13:00:00
dd2 <- data_raw %>%
mutate(`Bout Start` = mdy_hms(`Bout Start`),
`Bout End` = mdy_hms(`Bout End`),
is = format(`Bout Start`, "%H") != format(`Bout End`, "%H")) %>%
uncount(is + 1) %>%
group_by(grp = consecutive_id(is)) %>%
mutate(`Bout Start` = if_else(is & row_number() == 2,
ymd_hms(format(first(`Bout End`), "%Y-%m-d %H:00:00")), `Bout Start`),
`Bout End` = if_else(is & row_number() == 1,
ymd_hms(format(first(`Bout End`), "%Y-%m-%d %H:00:00")), `Bout End`),
`Time in Bout` = `Bout End` - `Bout Start`,
ID = ymd_hms(format(`Bout Start`, "%Y-%m-d %H:00:00")), is = NULL) %>%
ungroup() %>%
select(-grp)
dd2
# A tibble: 6 × 4
`Bout Start` `Bout End` `Time in Bout` ID
<dttm> <dttm> <drtn> <dttm>
1 2017-02-08 09:01:00 2017-02-08 09:12:00 11 mins 2017-02-08 09:00:00
2 2017-02-08 09:23:00 2017-02-08 09:38:00 15 mins 2017-02-08 09:00:00
3 2017-02-08 09:42:00 2017-02-08 09:52:00 10 mins 2017-02-08 09:00:00
4 2017-02-08 11:49:00 2017-02-08 12:00:00 11 mins 2017-02-08 11:00:00
5 2017-02-08 12:00:00 2017-02-08 12:05:00 5 mins 2017-02-08 12:00:00
6 2017-02-08 13:39:00 2017-02-08 13:58:00 19 mins 2017-02-08 13:00:00
连接dd1
和dd2
,还分离Date
和Hour
,并用0
替换缺失的日期/小时。
full_join(dd1, dd2, multiple="all") %>%
mutate(`Time in Bout` = replace_na(`Time in Bout`, duration(0))) %>%
summarize(`Time in Bout (Hourly)` = sum(`Time in Bout`), .by = ID) %>%
separate(ID, c("Date", "Hour"), sep=" ")
Joining with `by = join_by(ID)`
# A tibble: 5 × 3
Date Hour `Time in Bout (Hourly)`
<chr> <chr> <drtn>
1 2017-02-08 09:00:00 36 mins
2 2017-02-08 10:00:00 0 mins
3 2017-02-08 11:00:00 11 mins
4 2017-02-08 12:00:00 5 mins
5 2017-02-08 13:00:00 19 mins
英文:
A rather complex task, here is a tidyverse
approach
- get the sequence of dates/hours to enable filling of missing data,
dd1
- split hour-spanning times into their corresponding hour bin,
dd2
- join
dd1
anddd2
- also, on the fly, convert strings to dates and hours/full hours
Note, it's a dynamic approach; starting and ending hours/dates show up once they appear in the raw data.
library(dplyr) # >= v1.1.0 for ".by" in full_join's summarize and consecutive_id
library(tidyr) # separate and replace_na
library(lubridate) # date functions
dd1 <- tibble(ID = seq(
ymd_hms(format(first(mdy_hms(data_raw$`Bout Start`)),
"%Y-%m-%d %H:00:00")),
ymd_hms(format(last(mdy_hms(data_raw$`Bout Start`)),
"%Y-%m-%d %H:00:00")), 3600))
dd1
# A tibble: 5 × 1
ID
<dttm>
1 2017-02-08 09:00:00
2 2017-02-08 10:00:00
3 2017-02-08 11:00:00
4 2017-02-08 12:00:00
5 2017-02-08 13:00:00
dd2 <- data_raw %>%
mutate(`Bout Start` = mdy_hms(`Bout Start`),
`Bout End` = mdy_hms(`Bout End`),
is = format(`Bout Start`, "%H") != format(`Bout End`, "%H")) %>%
uncount(is + 1) %>%
group_by(grp = consecutive_id(is)) %>%
mutate(`Bout Start` = if_else(is & row_number() == 2,
ymd_hms(format(first(`Bout End`), "%Y-%m-%d %H:00:00")), `Bout Start`),
`Bout End` = if_else(is & row_number() == 1,
ymd_hms(format(first(`Bout End`), "%Y-%m-%d %H:00:00")), `Bout End`),
`Time in Bout` = `Bout End` - `Bout Start`,
ID = ymd_hms(format(`Bout Start`, "%Y-%m-%d %H:00:00")), is = NULL) %>%
ungroup() %>%
select(-grp)
dd2
# A tibble: 6 × 4
`Bout Start` `Bout End` `Time in Bout` ID
<dttm> <dttm> <drtn> <dttm>
1 2017-02-08 09:01:00 2017-02-08 09:12:00 11 mins 2017-02-08 09:00:00
2 2017-02-08 09:23:00 2017-02-08 09:38:00 15 mins 2017-02-08 09:00:00
3 2017-02-08 09:42:00 2017-02-08 09:52:00 10 mins 2017-02-08 09:00:00
4 2017-02-08 11:49:00 2017-02-08 12:00:00 11 mins 2017-02-08 11:00:00
5 2017-02-08 12:00:00 2017-02-08 12:05:00 5 mins 2017-02-08 12:00:00
6 2017-02-08 13:39:00 2017-02-08 13:58:00 19 mins 2017-02-08 13:00:00
Joining dd1
and dd2
, also separating Date
and Hour
and replace NA
from missing dates/hours with 0
.
full_join(dd1, dd2, multiple="all") %>%
mutate(`Time in Bout` = replace_na(`Time in Bout`, duration(0))) %>%
summarize(`Time in Bout (Hourly)` = sum(`Time in Bout`), .by = ID) %>%
separate(ID, c("Date", "Hour"), sep=" ")
Joining with `by = join_by(ID)`
# A tibble: 5 × 3
Date Hour `Time in Bout (Hourly)`
<chr> <chr> <drtn>
1 2017-02-08 09:00:00 36 mins
2 2017-02-08 10:00:00 0 mins
3 2017-02-08 11:00:00 11 mins
4 2017-02-08 12:00:00 5 mins
5 2017-02-08 13:00:00 19 mins
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论