英文:
How to find the maximum value for given range in a big data set?
问题
我有一个名为Jon的大型数据集,看起来像这样:
我还有另一个文件,我将其命名为"test",其中包含我想在Jon文件中查看的起始和结束坐标。
根据这些信息,您可以帮助我编写一段代码,以便为测试文件中的每行坐标提供最大的lgmeandiff信息。例如:test[c(3),]
,坐标位于358-366之间。因此,我想在Jon文件中查看这些坐标值,并选择在该范围内具有最大lgmeandiff的行,并将该信息写入输出文件中,对于每个坐标。
for(i in 1:nrow(test)){
listofcats <- seq.int(test$from[i], test$to[i])
}
tempframe <- c()
outputframe <- as.data.frame(matrix(nrow = 0, ncol = ncol(Jon)))
for(i in 1:nrow(test)){
for(k in 1:nrow(Jon)){
tempframe <- seq.int(test$from[i], test$to[i])
tempMax <- Jon[c(max(abs(Jon$lgmeandiff[k] %in% tempframe))),]
outputframe <- rbind(tempframe,outputframe)
}
}
}
编辑:
数据太大了,所以我包含了每个数据集的dput(head()):
对于Jon数据集:
structure(list(TTS = c(291L, 322L, 358L, 360L, 361L, 363L), lgmeandiff = c(-922337203685478,
-922337203685478, -0.0954, -0.9885, -1.4739, -3.7959), lgtotdiff = c(-922337203685478,
-922337203685478, 3.2265, 2.3334, 1.848, -0.4739), meandiff = c(0,
0, 0.936, 0.504, 0.36, 0.072), totdiff = c(0, 0, 9.36, 5.04,
3.6, 0.72), premean = c(0, 0.720000028610229, 1.15200004577637,
1.44000005722046, 1.58400001525879, 1.87200012207031), precnt = c(0,
7.20000028610229, 11.5200004577637, 14.4000005722046, 15.840000629425,
18.720000743866), sufmean = c(0, 0.720000028610229, 2.08800010681152,
1.94400005340576, 1.94400005340576, 1.94400005340576), sufcnt = c(0,
7.20000028610229, 20.8800008296967, 19.4400007724762, 19.4400007724762,
19.4400007724762)), row.names = c(NA, 6L), class = "data.frame")
对于测试数据集:
structure(list(from = c(291, 322, 358, 371, 392, 395), to = c(291L,
322L, 366L, 382L, 392L, 401L)), row.names = c(1L, 2L, 3L, 7L,
10L, 11L), class = "data.frame")
英文:
I have a big dataset I named Jon that looks like this:
and i have another file that i named "test" that has the start and end coordinates that I want to look at in the Jon file.
Based on this information, can you help me write a code that will give me the biggest lgmeandiff information for each of the coordinates that lies in the test file for each row.
For example: test[c(3),]
, the coordinates are between 358-366. SO I want to look at these coordinates values in the Jon file and choose the row that has the biggest lgmeandiff for that range and write that information to an output file for each of coordinates.
for(i in 1:nrow(test)){
listofcats <- seq.int(test$from[i], test$to[i])
}
tempframe <- c()
outputframe <- as.data.frame(matrix(nrow = 0, ncol = ncol(Jon)))
for(i in 1:nrow(test)){
for(k in 1:nrow(Jon)){
tempframe <- seq.int(test$from[i], test$to[i])
tempMax <- Jon[c(max(abs(Jon$lgmeandiff[k] %in% tempframe))),]
outputframe <- rbind(tempframe,outputframe)
}
}
}
Edit:
Data are too large, so I am including the dput(head()) for each data set:
for Jon dataset;
structure(list(TTS = c(291L, 322L, 358L, 360L, 361L, 363L), lgmeandiff = c(-922337203685478,
-922337203685478, -0.0954, -0.9885, -1.4739, -3.7959), lgtotdiff = c(-922337203685478,
-922337203685478, 3.2265, 2.3334, 1.848, -0.4739), meandiff = c(0,
0, 0.936, 0.504, 0.36, 0.072), totdiff = c(0, 0, 9.36, 5.04,
3.6, 0.72), premean = c(0, 0.720000028610229, 1.15200004577637,
1.44000005722046, 1.58400001525879, 1.87200012207031), precnt = c(0,
7.20000028610229, 11.5200004577637, 14.4000005722046, 15.840000629425,
18.720000743866), sufmean = c(0, 0.720000028610229, 2.08800010681152,
1.94400005340576, 1.94400005340576, 1.94400005340576), sufcnt = c(0,
7.20000028610229, 20.8800008296967, 19.4400007724762, 19.4400007724762,
19.4400007724762)), row.names = c(NA, 6L), class = "data.frame")
for test dataset
structure(list(from = c(291, 322, 358, 371, 392, 395), to = c(291L,
322L, 366L, 382L, 392L, 401L)), row.names = c(1L, 2L, 3L, 7L,
10L, 11L), class = "data.frame")
答案1
得分: 4
join_by
在 dplyr
1.1.0 中添加,用于创建连接规范。在您的情况下,您希望在 TSS
位于 from
和 to
之间(包括边界)时进行合并:
library(dplyr)
full_join(jon, test, by = join_by(between(TTS, from, to))) %>%
slice_max(lgmeandiff, by = c(from, to)) %>%
select(from, to, lgmeandiff)
输出
from to lgmeandiff
1 291 291 -9.223372e+14
2 322 322 -9.223372e+14
3 358 366 -9.540000e-02
4 371 382 NA
5 392 392 NA
6 395 401 NA
以下是将输出写入Excel文件的一种选项,其中每个坐标对将成为工作簿中的一个工作表:
library(dplyr)
full_join(jon, test, by = join_by(between(TTS, from, to))) %>%
slice_max(lgmeandiff, by = c(from, to)) %>%
mutate(coord = paste(from, "-", to),
lgmeandiff = lgmeandiff, .keep = "none") %>%
split(~ coord) %>%
writexl::write_xlsx("output.xlsx")
注意:Excel 有一个工作表数量的限制,最多为 255 个,所以如果您有超过 255 个唯一的坐标对(听起来您可能会有更多),那么您可以考虑其他输出类型。
英文:
join_by
was added in dplyr
1.1.0 to create a join specification. In your case you want to merge if TSS
falls between
from
and to
(inclusive):
library(dplyr)
full_join(jon, test, by = join_by(between(TTS, from, to))) |>
slice_max(lgmeandiff, by = c(from, to)) |>
select(from, to, lgmeandiff)
Output
from to lgmeandiff
1 291 291 -9.223372e+14
2 322 322 -9.223372e+14
3 358 366 -9.540000e-02
4 371 382 NA
5 392 392 NA
6 395 401 NA
Here is one option to write the output to an Excel file, where each coordinate pair will be a sheet in the workbook:
library(dplyr)
full_join(jon, test, by = join_by(between(TTS, from, to))) |>
slice_max(lgmeandiff, by = c(from, to)) |>
mutate(coord = paste(from, "-", to),
lgmeandiff = lgmeandiff, .keep = "none") |>
split(~ coord) |>
writexl::write_xlsx("output.xlsx")
Note: Excel has a limit on the number of sheets. It is 255, so if you have more than 255 unique pairs (which is sounds like you might) then you might consider an alternate output type.
答案2
得分: 3
A tidyverse solution:
library(dplyr)
library(purrr)
my_func <- function(df, min, max) {
df %>%
filter(TTS >= min, TTS <= max) %>%
pull(lgmeandiff) %>%
max()
}
mutate(test, lgmeandiff = map2_dbl(from, to, ~ my_func(Jon, .x, .y)))
from to lgmeandiff
1 291 291 -922337203685478.0000
2 322 322 -922337203685478.0000
3 358 366 -0.0954
7 371 382 -Inf
10 392 392 -Inf
11 395 401 -Inf
The `-Inf` is only occurring because those specific ranges are not available in the rows of `Jon` provided.
@jay.sf's answer is likely a faster one to execute than mine, which should be of preference to you given the size of your dataset.
<details>
<summary>英文:</summary>
A tidyverse solution:
library(dplyr)
library(purrr)
my_func <- function(df, min, max) {
df |>
filter(TTS >= min, TTS <= max) |>
pull(lgmeandiff) |>
max()
}
mutate(test, lgmeandiff = map2_dbl(from, to, ~ my_func(Jon, .x, .y)))
from to lgmeandiff
1 291 291 -922337203685478.0000
2 322 322 -922337203685478.0000
3 358 366 -0.0954
7 371 382 -Inf
10 392 392 -Inf
11 395 401 -Inf
The `-Inf` is only occurring because those specific ranges are not available in the rows of `Jon` provided.
@jay.sf's answer is likely a faster one to execute than mine, which should be of preference to you given the size of your dataset.
</details>
# 答案3
**得分**: 2
在`Map`函数中使用`seq.int`。
```R
with(test, Map(seq.int, from, to)) |
{
sapply(., \(i) max(jon[jon$TTS %in% i, 'lgmeandiff']))
}() |
setNames(apply(test, 1, toString))
# 291, 291 322, 322 358, 366 371, 382 392, 392 395, 401
# -9.223372e+14 -9.223372e+14 -9.540000e-02 -Inf -Inf -Inf
# Warning messages:
# 1: In max(jon[jon$TTS %in% i, 'lgmeandiff']) :
# no non-missing arguments to max; returning -Inf
# 2: In max(jon[jon$TTS %in% i, 'lgmeandiff']) :
# no non-missing arguments to max; returning -Inf
# 3: In max(jon[jon$TTS %in% i, 'lgmeandiff']) :
# no non-missing arguments to max; returning -Inf
更详细的版本,不包含警告:
with(test, Map(seq.int, from, to)) |
{
sapply(., \(i) {
v <- jon[jon$TTS %in% i, 'lgmeandiff']
if (all(is.na(v))) {
NA_real_
} else {
max(jon[jon$TTS %in% i, 'lgmeandiff'])
}
})
}() |
setNames(apply(test, 1, toString))
# 291, 291 322, 322 358, 366 371, 382 392, 392 395, 401
# -9.223372e+14 -9.223372e+14 -9.540000e-02 NA NA NA
你也可以考虑使用max(., na.rm=TRUE)
。
英文:
Use seq.int
in Map
.
with(test, Map(seq.int, from, to)) |>
{\(.) sapply(., \(i) max(jon[jon$TTS %in% i, 'lgmeandiff']))}() |>
setNames(apply(test, 1, toString))
# 291, 291 322, 322 358, 366 371, 382 392, 392 395, 401
# -9.223372e+14 -9.223372e+14 -9.540000e-02 -Inf -Inf -Inf
# Warning messages:
# 1: In max(jon[jon$TTS %in% i, "lgmeandiff"]) :
# no non-missing arguments to max; returning -Inf
# 2: In max(jon[jon$TTS %in% i, "lgmeandiff"]) :
# no non-missing arguments to max; returning -Inf
# 3: In max(jon[jon$TTS %in% i, "lgmeandiff"]) :
# no non-missing arguments to max; returning -Inf
More verbose version without warnings:
with(test, Map(seq.int, from, to)) |>
{\(.) sapply(., \(i) {
v <- jon[jon$TTS %in% i, 'lgmeandiff']
if (all(is.na(v))) {
NA_real_
} else {
max(jon[jon$TTS %in% i, 'lgmeandiff'])
}
})}() |>
setNames(apply(test, 1, toString))
# 291, 291 322, 322 358, 366 371, 382 392, 392 395, 401
# -9.223372e+14 -9.223372e+14 -9.540000e-02 NA NA NA
You could also consider max(., na.rm=TRUE)
.
Data:
jon <- structure(list(TTS = c(291L, 322L, 358L, 360L, 361L, 363L), lgmeandiff = c(-922337203685478,
-922337203685478, -0.0954, -0.9885, -1.4739, -3.7959), lgtotdiff = c(-922337203685478,
-922337203685478, 3.2265, 2.3334, 1.848, -0.4739), meandiff = c(0,
0, 0.936, 0.504, 0.36, 0.072), totdiff = c(0, 0, 9.36, 5.04,
3.6, 0.72), premean = c(0, 0.720000028610229, 1.15200004577637,
1.44000005722046, 1.58400001525879, 1.87200012207031), precnt = c(0,
7.20000028610229, 11.5200004577637, 14.4000005722046, 15.840000629425,
18.720000743866), sufmean = c(0, 0.720000028610229, 2.08800010681152,
1.94400005340576, 1.94400005340576, 1.94400005340576), sufcnt = c(0,
7.20000028610229, 20.8800008296967, 19.4400007724762, 19.4400007724762,
19.4400007724762)), row.names = c(NA, 6L), class = "data.frame")
test <- structure(list(from = c(291, 322, 358, 371, 392, 395), to = c(291L,
322L, 366L, 382L, 392L, 401L)), row.names = c(1L, 2L, 3L, 7L,
10L, 11L), class = "data.frame")
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论