英文:
Object of type 'closure' is not subsettable when training a geographically weighted random forest model
问题
我使用R
包SpatialML
运行了一个地理加权随机森林回归模型。我将数据集分为训练集和测试集。当我尝试使用函数grf.bw
计算训练集的算法带宽时,出现了以下错误:
Error in dframe[DepVarName] : object of type 'closure' is not subsettable
如果我使用整个数据集,函数会正常运行而不出错。我发现其他帖子中出现了相同的错误消息,但它们大多集中在Shiny应用程序上。为什么在训练集上训练模型时会出现这个错误消息?
以下是您提供的代码的翻译部分:
# 加载所需的库
library(GWmodel)
library(sp)
library(raster)
library(SpatialML)
library(terra)
library(caret)
library(doParallel)
# Directory that contains the input CSV
wd <- "path/"

# Load the data; the coordinates are taken from the first two columns
block.data <- read.csv(paste0(wd, "block.data.csv"))
Coords <- block.data[, 1:2]

# Expand the categorical lc column into indicator (dummy) columns;
# "- 1" removes the intercept so every level gets its own column
df_dummies <- model.matrix(~ factor(lc) - 1, data = block.data)
# Append the indicator columns to the original data frame
block.data <- cbind(block.data, df_dummies)
# Drop the original lc column
block.data[["lc"]] <- NULL
colnames(block.data)[7:9] <- c("lc30", "lc50", "lc60")

# Model formula
eq1 <- ntl ~ pop + tirs + agbh + lc30 + lc50 + lc60

# 80/20 split of the rows with a fixed seed
set.seed(1123)
samp <- sample(nrow(block.data), 0.80 * nrow(block.data))
train <- block.data[samp, ]
Coords_train <- train[, 1:2]
test <- block.data[-samp, ]
Coords_test <- test[, 1:2]
# Start a PSOCK cluster on all but one core and register it for caret
no_cores <- detectCores() - 1
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

# Resampling control: method "repeatedcv" with number = 3 and grid search
trControl <- trainControl(
  method = "repeatedcv",
  number = 3,
  search = "grid",
  savePredictions = FALSE
)

# Baseline random forest using caret's default tuning grid
rf_default <- train(
  eq1,
  data = train,
  method = "rf",
  metric = "Rsquared",
  trControl = trControl
)
print(rf_default)

# Search for the best mtry among 2..6
set.seed(1234444)
tuneGrid <- expand.grid(.mtry = 2:6)
rf_mtry <- train(
  eq1,
  data = train,
  method = "rf",
  metric = "Rsquared",
  tuneGrid = tuneGrid,
  trControl = trControl,
  importance = TRUE,
  nodesize = 10,
  ntree = 500
)
best_mtry <- rf_mtry$bestTune[["mtry"]]
# Search for the best maxnodes (5..15) at the previously selected mtry
tuneGrid <- expand.grid(.mtry = best_mtry)
best.rsq <- -1
best.maxnodes <- 0
for (maxnodes in 5:15) {
  set.seed(3455556)
  rf_maxnode <- train(eq1,
                      data = train,
                      method = "rf",
                      metric = "Rsquared",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 10,
                      maxnodes = maxnodes,
                      ntree = 500)
  # finalModel$rsq holds one pseudo R-squared per tree count (length ntree).
  # Use the last element (the full forest) so the comparison below is a
  # scalar; a vector condition in `if` is an error in R >= 4.2.
  rsq <- utils::tail(rf_maxnode$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.maxnodes <- maxnodes
  }
}
# Search for the best ntree (500, 1000, 1500, 2000) at the selected mtry
# and maxnodes
best.ntree <- -1
best.rsq <- -1
for (ntree in seq(from = 500, to = 2000, by = 500)) {
  set.seed(67777789)
  rf_maxtrees <- train(eq1,
                       data = train,
                       method = "rf",
                       metric = "Rsquared",
                       tuneGrid = tuneGrid,
                       trControl = trControl,
                       importance = TRUE,
                       nodesize = 10,
                       maxnodes = best.maxnodes,
                       ntree = ntree)
  # finalModel$rsq holds one pseudo R-squared per tree count (length ntree).
  # Use the last element (the full forest) so the comparison below is a
  # scalar; a vector condition in `if` is an error in R >= 4.2.
  rsq <- utils::tail(rf_maxtrees$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.ntree <- ntree
  }
}
# Bandwidth selection for the geographically weighted random forest, using
# the adaptive kernel with bw.min = 10, bw.max = 11 and step = 1.
# NOTE(review): this is the call that fails with
# "object of type 'closure' is not subsettable". The data frame `train` shares
# its name with caret's train() function; presumably the name is resolved to
# that function inside grf.bw. Renaming the data frame (e.g. df_train) is the
# reported fix.
abw = grf.bw(eq1,
train,
kernel = "adaptive",
coords = Coords_train,
bw.min = 10,
bw.max = 11,
step = 1,
trees = best.ntree,
mtry = best_mtry,
importance = "impurity",
nthreads = 3,
forests = FALSE,
weighted = FALSE)
# Show the selected bandwidth
abw$Best.BW
# 小样本数据集
block.data = structure(list(x = c(4517800L, 4517800L, 4517800L, 4517800L,
4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L,
4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L,
4517800L, 4517800L), y = c(4308200L, 4307800L, 4307400L, 4307000L,
4306600L, 4306200L, 4305800L, 4305400L, 4305000L, 4304600L, 4304200L,
4303800L, 4303400L, 4303000L, 4302600L, 4302200L, 4301800L, 4301400L,
4301000L, 4300600L), ntl = c(58.661979675293, 58.436840057373,
55.8877563476562, 51.9307441711426, 52.9197845458984, 58.1713027954102,
59.4091491699219, 58.0057525634766, 62.3850288391113, 43.693775177002,
32.7657432556152, 38.814208984375, 45.2988662719727, 30.7070808410645,
12.8714799880981, 12.8714799880981, 13.521935462951
<details>
<summary>英文:</summary>
I run a geographically weighted random forest regression model using the `R` package `SpatialML`. I separated the data set into train and test sets. When I try to calculate the bandwidth for the algorithm for the train set using the function `grf.bw`, I get this error:
Error in dframe[DepVarName] : object of type 'closure' is not subsettable
If I use the whole data set, the function runs without an issue. I have found other posts with the same error message, but they mostly focus on Shiny apps. Why do I get this error message when I train the model using the train set?
library(GWmodel)
library(sp)
library(raster)
library(SpatialML)
library(terra)
library(caret)
library(doParallel)
# Directory that contains the input CSV
wd <- "path/"

# Load the data; the coordinates are taken from the first two columns
block.data <- read.csv(paste0(wd, "block.data.csv"))
Coords <- block.data[, 1:2]

# Expand the categorical lc column into indicator (dummy) columns;
# "- 1" removes the intercept so every level gets its own column
df_dummies <- model.matrix(~ factor(lc) - 1, data = block.data)
# Append the indicator columns to the original data frame
block.data <- cbind(block.data, df_dummies)
# Drop the original lc column
block.data[["lc"]] <- NULL
colnames(block.data)[7:9] <- c("lc30", "lc50", "lc60")

# Model formula
eq1 <- ntl ~ pop + tirs + agbh + lc30 + lc50 + lc60

# 80/20 split of the rows with a fixed seed
set.seed(1123)
samp <- sample(nrow(block.data), 0.80 * nrow(block.data))
train <- block.data[samp, ]
Coords_train <- train[, 1:2]
test <- block.data[-samp, ]
Coords_test <- test[, 1:2]
# Start a PSOCK cluster on all but one core and register it for caret
no_cores <- detectCores() - 1
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

# Resampling control: method "repeatedcv" with number = 3 and grid search
trControl <- trainControl(
  method = "repeatedcv",
  number = 3,
  search = "grid",
  savePredictions = FALSE
)

# Baseline random forest using caret's default tuning grid
rf_default <- train(
  eq1,
  data = train,
  method = "rf",
  metric = "Rsquared",
  trControl = trControl
)
print(rf_default)

# Search for the best mtry among 2..6
set.seed(1234444)
tuneGrid <- expand.grid(.mtry = 2:6)
rf_mtry <- train(
  eq1,
  data = train,
  method = "rf",
  metric = "Rsquared",
  tuneGrid = tuneGrid,
  trControl = trControl,
  importance = TRUE,
  nodesize = 10,
  ntree = 500
)
best_mtry <- rf_mtry$bestTune[["mtry"]]
# Search for the best maxnodes (5..15) at the previously selected mtry
tuneGrid <- expand.grid(.mtry = best_mtry)
best.rsq <- -1
best.maxnodes <- 0
for (maxnodes in 5:15) {
  set.seed(3455556)
  rf_maxnode <- train(eq1,
                      data = train,
                      method = "rf",
                      metric = "Rsquared",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 10,
                      maxnodes = maxnodes,
                      # tuneLength = 50,
                      ntree = 500)
  # finalModel$rsq holds one pseudo R-squared per tree count (length ntree).
  # Use the last element (the full forest) so the comparison below is a
  # scalar; a vector condition in `if` is an error in R >= 4.2.
  rsq <- utils::tail(rf_maxnode$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.maxnodes <- maxnodes
  }
}
# Search for the best ntree (500, 1000, 1500, 2000) at the selected mtry
# and maxnodes
best.ntree <- -1
best.rsq <- -1
for (ntree in seq(from = 500, to = 2000, by = 500)) {
  set.seed(67777789)
  rf_maxtrees <- train(eq1,
                       data = train,
                       method = "rf",
                       metric = "Rsquared",
                       tuneGrid = tuneGrid,
                       trControl = trControl,
                       importance = TRUE,
                       nodesize = 10,
                       maxnodes = best.maxnodes,
                       ntree = ntree)
  # finalModel$rsq holds one pseudo R-squared per tree count (length ntree).
  # Use the last element (the full forest) so the comparison below is a
  # scalar; a vector condition in `if` is an error in R >= 4.2.
  rsq <- utils::tail(rf_maxtrees$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.ntree <- ntree
  }
}
# Bandwidth selection for the geographically weighted random forest, using
# the adaptive kernel with bw.min = 10, bw.max = 11 and step = 1.
# NOTE(review): this is the call that fails with
# "object of type 'closure' is not subsettable". The data frame `train` shares
# its name with caret's train() function; presumably the name is resolved to
# that function inside grf.bw. Renaming the data frame (e.g. df_train) is the
# reported fix.
abw = grf.bw(eq1,
train,
kernel = "adaptive",
coords = Coords_train,
bw.min = 10,
bw.max = 11,
step = 1,
trees = best.ntree,
mtry = best_mtry,
importance = "impurity",
nthreads = 3,
forests = FALSE,
weighted = FALSE)
# Show the selected bandwidth
abw$Best.BW
A small sample data set:
block.data = structure(list(x = c(4517800L, 4517800L, 4517800L, 4517800L,
4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L,
4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L,
4517800L, 4517800L), y = c(4308200L, 4307800L, 4307400L, 4307000L,
4306600L, 4306200L, 4305800L, 4305400L, 4305000L, 4304600L, 4304200L,
4303800L, 4303400L, 4303000L, 4302600L, 4302200L, 4301800L, 4301400L,
4301000L, 4300600L), ntl = c(58.661979675293, 58.436840057373,
55.8877563476562, 51.9307441711426, 52.9197845458984, 58.1713027954102,
59.4091491699219, 58.0057525634766, 62.3850288391113, 43.693775177002,
32.7657432556152, 38.814208984375, 45.2988662719727, 30.7070808410645,
12.8714799880981, 12.8714799880981, 13.5219354629517, 23.7446193695068,
37.4298362731934, 46.3565788269043), pop = c(75.9549713134766,
77.1868209838867, 77.5336608886719, 79.3608703613281, 82.8243789672852,
86.7496871948242, 90.5956954956055, 92.8819274902344, 92.3746337890625,
87.0225830078125, 80.3043441772461, 84.9410400390625, 85.2942810058594,
83.6485595703125, 61.1035919189453, 40.2169418334961, 53.4071769714355,
76.6943511962891, 80.3962097167969, 80.990608215332), tirs = c(29.3157043457031,
29.6056385040283, 29.3482208251953, 29.9990634918213, 30.0760860443115,
30.7479610443115, 32.0067825317383, 32.5420265197754, 33.4829483032227,
34.0894584655762, 34.0738372802734, 33.5429801940918, 34.0574531555176,
35.3299293518066, 34.9521026611328, 35.1631278991699, 35.4775505065918,
34.7371597290039, 33.3271560668945, 34.1149940490723), agbh = c(4.38038444519043,
5.61569929122925, 6.89490604400635, 6.30956649780273, 6.12616586685181,
6.96545696258545, 6.16812896728516, 1.98371779918671, 1.08961737155914,
0.348455667495728, 0.0777057632803917, 0.0572107657790184, 0.53607851266861,
0.362500905990601, 0.0665311068296432, 0.138985157012939, 0.217744708061218,
0.737295210361481, 3.04893350601196, 1.89484882354736), lc = c(50L,
50L, 50L, 50L, 50L, 50L, 50L, 50L, 30L, 60L, 30L, 60L, 50L, 60L,
30L, 30L, 30L, 50L, 50L, 60L)), class = "data.frame", row.names = c(NA,
-20L))
</details>
# 答案1
**得分**: 1
如@MrFlick和@Roland所述,通过将名为*train*的`data.frame`更改为*df_train*,问题得以解决。
完整的代码:
```R
# Directory that contains the input CSV
wd <- "path/"

# Load the data; the coordinates are taken from the first two columns
block.data <- read.csv(paste0(wd, "block.data.csv"))
Coords <- block.data[, 1:2]

# Expand the categorical lc column into indicator (dummy) columns;
# "- 1" removes the intercept so every level gets its own column
df_dummies <- model.matrix(~ factor(lc) - 1, data = block.data)
# Append the indicator columns to the original data frame
block.data <- cbind(block.data, df_dummies)
# Drop the original lc column
block.data[["lc"]] <- NULL
colnames(block.data)[7:9] <- c("lc30", "lc50", "lc60")

# Model formula
eq1 <- ntl ~ pop + tirs + agbh + lc30 + lc50 + lc60

# 80/20 split of the rows with a fixed seed. The training frame is named
# df_train (not train) so it does not share a name with caret's train().
set.seed(1123)
samp <- sample(nrow(block.data), 0.80 * nrow(block.data))
df_train <- block.data[samp, ]
Coords_train <- df_train[, 1:2]
test <- block.data[-samp, ]
Coords_test <- test[, 1:2]
# Start a PSOCK cluster on all but one core and register it for caret
no_cores <- detectCores() - 1
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

# Resampling control: method "repeatedcv" with number = 3 and grid search
trControl <- trainControl(
  method = "repeatedcv",
  number = 3,
  search = "grid",
  savePredictions = FALSE
)

# Baseline random forest using caret's default tuning grid
rf_default <- train(
  eq1,
  data = df_train,
  method = "rf",
  metric = "Rsquared",
  trControl = trControl
)
print(rf_default)

# Search for the best mtry among 2..6
set.seed(1234444)
tuneGrid <- expand.grid(.mtry = 2:6)
rf_mtry <- train(
  eq1,
  data = df_train,
  method = "rf",
  metric = "Rsquared",
  tuneGrid = tuneGrid,
  trControl = trControl,
  importance = TRUE,
  nodesize = 10,
  ntree = 500
)
best_mtry <- rf_mtry$bestTune[["mtry"]]
# Search for the best maxnodes (5..15) at the previously selected mtry
tuneGrid <- expand.grid(.mtry = best_mtry)
best.rsq <- -1
best.maxnodes <- 0
for (maxnodes in 5:15) {
  set.seed(3455556)
  rf_maxnode <- train(eq1,
                      data = df_train,
                      method = "rf",
                      metric = "Rsquared",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 10,
                      maxnodes = maxnodes,
                      ntree = 500)
  # finalModel$rsq holds one pseudo R-squared per tree count (length ntree).
  # Use the last element (the full forest) so the comparison below is a
  # scalar; a vector condition in `if` is an error in R >= 4.2.
  rsq <- utils::tail(rf_maxnode$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.maxnodes <- maxnodes
  }
}
# Search for the best ntree (500, 1000, 1500, 2000) at the selected mtry
# and maxnodes
best.ntree <- -1
best.rsq <- -1
for (ntree in seq(from = 500, to = 2000, by = 500)) {
  set.seed(67777789)
  rf_maxtrees <- train(eq1,
                       data = df_train,
                       method = "rf",
                       metric = "Rsquared",
                       tuneGrid = tuneGrid,
                       trControl = trControl,
                       importance = TRUE,
                       nodesize = 10,
                       maxnodes = best.maxnodes,
                       ntree = ntree)
  # finalModel$rsq holds one pseudo R-squared per tree count (length ntree).
  # Use the last element (the full forest) so the comparison below is a
  # scalar; a vector condition in `if` is an error in R >= 4.2.
  rsq <- utils::tail(rf_maxtrees$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.ntree <- ntree
  }
}
# Bandwidth selection for the geographically weighted random forest on the
# training split, using the adaptive kernel with bw.min = 10, bw.max = 11
# and step = 1
abw <- grf.bw(
  eq1,
  df_train,
  kernel = "adaptive",
  coords = Coords_train,
  bw.min = 10,
  bw.max = 11,
  step = 1,
  trees = best.ntree,
  mtry = best_mtry,
  importance = "impurity",
  nthreads = 3,
  forests = FALSE,
  weighted = FALSE
)
# Show the selected bandwidth
abw$Best.BW
# Fit the geographically weighted random forest on the training split with
# the tuned bandwidth, ntree and mtry
grf.model <- grf(
  eq1,
  dframe = df_train,
  bw = abw$Best.BW,
  ntree = best.ntree,
  mtry = best_mtry,
  kernel = "adaptive",
  forests = TRUE,
  coords = Coords_train,
  nthreads = 3
)
# Check R-squared on the test data.
# The data frame and the coordinates must describe the same rows, so both come
# from the test split. `train` is no longer a data frame after the rename to
# df_train: it would resolve to caret's train() function and reproduce the
# "object of type 'closure' is not subsettable" error.
# NOTE(review): this fits a separate model on the test rows rather than
# scoring grf.model on them; if out-of-sample accuracy is wanted, consider
# predict.grf() instead. Also confirm abw$Best.BW (tuned on the training
# split) does not exceed nrow(test).
grf.model2 <- grf(eq1,
                  dframe = test,
                  bw = abw$Best.BW,
                  ntree = best.ntree,
                  mtry = best_mtry,
                  kernel = "adaptive",
                  forests = TRUE,
                  coords = Coords_test,
                  nthreads = 3)
grf.model2
# Shut down the parallel workers started with makePSOCKcluster()
stopCluster(cl)
英文:
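As stated by @MrFlick and @Roland, the problem is solved by renaming the data.frame
called train to df_train. The name train is also a function in the caret package, which is apparently why a function (a "closure") ended up being subset instead of the data.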
The complete code:
# Directory that contains the input CSV
wd <- "path/"

# Load the data; the coordinates are taken from the first two columns
block.data <- read.csv(paste0(wd, "block.data.csv"))
Coords <- block.data[, 1:2]

# Expand the categorical lc column into indicator (dummy) columns;
# "- 1" removes the intercept so every level gets its own column
df_dummies <- model.matrix(~ factor(lc) - 1, data = block.data)
# Append the indicator columns to the original data frame
block.data <- cbind(block.data, df_dummies)
# Drop the original lc column
block.data[["lc"]] <- NULL
colnames(block.data)[7:9] <- c("lc30", "lc50", "lc60")

# Model formula
eq1 <- ntl ~ pop + tirs + agbh + lc30 + lc50 + lc60

# 80/20 split of the rows with a fixed seed. The training frame is named
# df_train (not train) so it does not share a name with caret's train().
set.seed(1123)
samp <- sample(nrow(block.data), 0.80 * nrow(block.data))
df_train <- block.data[samp, ]
Coords_train <- df_train[, 1:2]
test <- block.data[-samp, ]
Coords_test <- test[, 1:2]
# Start a PSOCK cluster on all but one core and register it for caret
no_cores <- detectCores() - 1
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

# Resampling control: method "repeatedcv" with number = 3 and grid search
trControl <- trainControl(
  method = "repeatedcv",
  number = 3,
  search = "grid",
  savePredictions = FALSE
)

# Baseline random forest using caret's default tuning grid
rf_default <- train(
  eq1,
  data = df_train,
  method = "rf",
  metric = "Rsquared",
  trControl = trControl
)
print(rf_default)

# Search for the best mtry among 2..6
set.seed(1234444)
tuneGrid <- expand.grid(.mtry = 2:6)
rf_mtry <- train(
  eq1,
  data = df_train,
  method = "rf",
  metric = "Rsquared",
  tuneGrid = tuneGrid,
  trControl = trControl,
  importance = TRUE,
  nodesize = 10,
  ntree = 500
)
best_mtry <- rf_mtry$bestTune[["mtry"]]
# Search for the best maxnodes (5..15) at the previously selected mtry
tuneGrid <- expand.grid(.mtry = best_mtry)
best.rsq <- -1
best.maxnodes <- 0
for (maxnodes in 5:15) {
  set.seed(3455556)
  rf_maxnode <- train(eq1,
                      data = df_train,
                      method = "rf",
                      metric = "Rsquared",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 10,
                      maxnodes = maxnodes,
                      # tuneLength = 50,
                      ntree = 500)
  # finalModel$rsq holds one pseudo R-squared per tree count (length ntree).
  # Use the last element (the full forest) so the comparison below is a
  # scalar; a vector condition in `if` is an error in R >= 4.2.
  rsq <- utils::tail(rf_maxnode$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.maxnodes <- maxnodes
  }
}
# Search for the best ntree (500, 1000, 1500, 2000) at the selected mtry
# and maxnodes
best.ntree <- -1
best.rsq <- -1
for (ntree in seq(from = 500, to = 2000, by = 500)) {
  set.seed(67777789)
  rf_maxtrees <- train(eq1,
                       data = df_train,
                       method = "rf",
                       metric = "Rsquared",
                       tuneGrid = tuneGrid,
                       trControl = trControl,
                       importance = TRUE,
                       nodesize = 10,
                       maxnodes = best.maxnodes,
                       ntree = ntree)
  # finalModel$rsq holds one pseudo R-squared per tree count (length ntree).
  # Use the last element (the full forest) so the comparison below is a
  # scalar; a vector condition in `if` is an error in R >= 4.2.
  rsq <- utils::tail(rf_maxtrees$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.ntree <- ntree
  }
}
# Bandwidth selection for the geographically weighted random forest on the
# training split, using the adaptive kernel with bw.min = 10, bw.max = 11
# and step = 1
abw <- grf.bw(
  eq1,
  df_train,
  kernel = "adaptive",
  coords = Coords_train,
  bw.min = 10,
  bw.max = 11,
  step = 1,
  trees = best.ntree,
  mtry = best_mtry,
  importance = "impurity",
  nthreads = 3,
  forests = FALSE,
  weighted = FALSE
)
# Show the selected bandwidth
abw$Best.BW
# Train the geographically weighted random forest on the training split with
# the tuned bandwidth, ntree and mtry.
# The data frame must be df_train: after the rename, `train` only refers to
# caret's train() function, and passing it as dframe reproduces the
# "object of type 'closure' is not subsettable" error.
grf.model <- grf(eq1,
                 dframe = df_train,
                 bw = abw$Best.BW,
                 ntree = best.ntree,
                 mtry = best_mtry,
                 kernel = "adaptive",
                 forests = TRUE,
                 coords = Coords_train,
                 nthreads = 3)
# Check R-squared on the test data.
# The data frame and the coordinates must describe the same rows, so both come
# from the test split. `train` is no longer a data frame after the rename to
# df_train: it would resolve to caret's train() function and reproduce the
# "object of type 'closure' is not subsettable" error.
# NOTE(review): this fits a separate model on the test rows rather than
# scoring grf.model on them; if out-of-sample accuracy is wanted, consider
# predict.grf() instead. Also confirm abw$Best.BW (tuned on the training
# split) does not exceed nrow(test).
grf.model2 <- grf(eq1,
                  dframe = test,
                  bw = abw$Best.BW,
                  ntree = best.ntree,
                  mtry = best_mtry,
                  kernel = "adaptive",
                  forests = TRUE,
                  coords = Coords_test,
                  nthreads = 3)
grf.model2
# Shut down the parallel workers started with makePSOCKcluster()
stopCluster(cl)
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论