Object of type 'closure' is not subsettable when training a geographically weighted random forest model

huangapple go评论63阅读模式
英文:

Object of type 'closure' is not subsettable when training a geographically weighted random forest model

问题

我使用R包SpatialML运行了一个地理加权随机森林回归模型。我将数据集分为训练集和测试集。当我尝试使用函数grf.bw计算训练集的算法带宽时,出现了以下错误:

Error in dframe[DepVarName] : object of type 'closure' is not subsettable

如果我使用整个数据集,函数会正常运行而不出错。我发现其他帖子中出现了相同的错误消息,但它们大多集中在Shiny应用程序上。为什么在训练集上训练模型时会出现这个错误消息?

代码如下:

# 加载所需的库
library(GWmodel)
library(sp)
library(raster)
library(SpatialML)
library(terra)
library(caret)
library(doParallel)

wd = "path/"

# 读取数据
block.data = read.csv(paste0(wd, "block.data.csv"))

Coords <- block.data[, 1:2]

# 将lc列(分类变量)转换为哑变量
df_dummies <- model.matrix(~factor(lc) - 1, data = block.data)

# 将哑变量与原始数据框合并
block.data <- cbind(block.data, df_dummies)

# 删除原始lc变量
block.data$lc <- NULL

colnames(block.data)[7:9] = c("lc30", "lc50", "lc60")

eq1 = ntl ~ pop + tirs + agbh + lc30 + lc50 + lc60

set.seed(1123)

samp <- sample(nrow(block.data), 0.80 * nrow(block.data))

train <- block.data[samp, ]

Coords_train <- train[, 1:2]

test <- block.data[-samp, ]

Coords_test <- test[, 1:2]

no_cores <- detectCores() - 1
cl = makePSOCKcluster(no_cores)
registerDoParallel(cl)

# 定义控制参数
trControl = trainControl(method = "repeatedcv", 
                         number = 3, 
                         search = "grid",
                         savePredictions = FALSE)

rf_default = train(eq1, 
                   data = train, 
                   method = "rf", 
                   metric = "Rsquared", 
                   trControl = trControl)

print(rf_default)

# 搜索最佳mtry
set.seed(1234444)
tuneGrid <- expand.grid(.mtry = c(2:6))
rf_mtry <- train(eq1,
                 data = train,
                 method = "rf",
                 metric = "Rsquared",
                 tuneGrid = tuneGrid,
                 trControl = trControl,
                 importance = TRUE,
                 nodesize = 10,
                 ntree = 500)

best_mtry <- rf_mtry$bestTune$mtry

# 搜索最佳maxnodes
tuneGrid = expand.grid(.mtry = best_mtry)

best.rsq <- -1
best.maxnodes <- 0
for (maxnodes in c(5:15)){
  set.seed(3455556)
  rf_maxnode = train(eq1, 
                     data = train, 
                     method = "rf", 
                     metric = "Rsquared", 
                     tuneGrid = tuneGrid, 
                     trControl = trControl, 
                     importance = TRUE, 
                     nodesize = 10, 
                     maxnodes = maxnodes,
                     ntree = 500)
  rsq <- rf_maxnode$finalModel$rsq
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.maxnodes <- maxnodes
  }
}

# 搜索最佳ntree
best.ntree <- -1
best.rsq <- -1
for (ntree in seq(from = 500, to = 2000, by = 500)) {
  set.seed(67777789)
  rf_maxtrees = train(eq1, 
                      data = train, 
                      method = "rf", 
                      metric = "Rsquared", 
                      tuneGrid = tuneGrid, 
                      trControl = trControl, 
                      importance = TRUE, 
                      nodesize = 10, 
                      maxnodes = best.maxnodes,
                      ntree = ntree)
  rsq <- rf_maxtrees$finalModel$rsq
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.ntree <- ntree
  }
}

# Search for the adaptive-kernel bandwidth on the training subset.
# This is the call that raises
#   Error in dframe[DepVarName] : object of type 'closure' is not subsettable
# `train` is also the name of caret's train() function (caret is attached
# above). NOTE(review): grf.bw appears to resolve the name `train` to that
# function instead of this data frame; giving the data frame a name that does
# not clash with a function avoids the error.
abw = grf.bw(eq1, 
             train, 
             kernel = "adaptive", 
             coords = Coords_train, 
             bw.min = 10,
             bw.max = 11, 
             step = 1, 
             trees = best.ntree, 
             mtry = best_mtry, 
             importance = "impurity",
             nthreads = 3, 
             forests = FALSE, 
             weighted = FALSE)

# Bandwidth selected by the search
abw$Best.BW

# 小样本数据集
block.data = structure(list(x = c(4517800L, 4517800L, 4517800L, 4517800L, 
4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 
4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 
4517800L, 4517800L), y = c(4308200L, 4307800L, 4307400L, 4307000L, 
4306600L, 4306200L, 4305800L, 4305400L, 4305000L, 4304600L, 4304200L, 
4303800L, 4303400L, 4303000L, 4302600L, 4302200L, 4301800L, 4301400L, 
4301000L, 4300600L), ntl = c(58.661979675293, 58.436840057373, 
55.8877563476562, 51.9307441711426, 52.9197845458984, 58.1713027954102, 
59.4091491699219, 58.0057525634766, 62.3850288391113, 43.693775177002, 
32.7657432556152, 38.814208984375, 45.2988662719727, 30.7070808410645, 
12.8714799880981, 12.8714799880981, 13.521935462951

<details>
<summary>英文:</summary>

I run a geographically weighted random forest regression model using the `R` package `SpatialML`. I separated the data set into train and test sets. When I try to calculate the bandwidth for the algorithm for the train set using the function `grf.bw`, I get this error:

    Error in dframe[DepVarName] : object of type 'closure' is not subsettable

If I use the whole data set, the function runs without an issue. I have found other posts with the same error message, but they mostly focus on Shiny apps. Why do I get this error message when I train the model using the train set?

    library(GWmodel)
    library(sp)
    library(raster)
    library(SpatialML)
    library(terra)
    library(caret)
    library(doParallel)
    
    wd = &quot;path/&quot;
    
    block.data = read.csv(paste0(wd, &quot;block.data.csv&quot;))
    
    Coords&lt;-block.data[ ,1:2]
    
    # convert lc column (categorical variable) to dummy variables
    df_dummies &lt;- model.matrix(~ factor(lc) - 1, data = block.data)
    
    # merge the dummy variables with the original data frame
    block.data &lt;- cbind(block.data, df_dummies)
    
    # remove the original lc variable
    block.data$lc &lt;- NULL
    
    colnames(block.data)[7:9] = c(&quot;lc30&quot;, &quot;lc50&quot;, &quot;lc60&quot;)
    
    eq1 = ntl ~ pop + tirs + agbh + lc30 + lc50 + lc60
    
    set.seed(1123)
    
    samp &lt;- sample(nrow(block.data), 0.80 * nrow(block.data))
    
    train &lt;- block.data[samp, ]
    
    Coords_train &lt;- train[ ,1:2]
    
    test &lt;- block.data[-samp, ]
    
    Coords_test &lt;- test[ ,1:2]
    
    no_cores &lt;- detectCores() - 1
    cl = makePSOCKcluster(no_cores)
    registerDoParallel(cl)
    
    # define the control
    trControl = trainControl(method = &quot;repeatedcv&quot;, 
                             number = 3, 
                             search = &quot;grid&quot;,
                             savePredictions = FALSE)
    
    rf_default = train(eq1, 
                       data = train, 
                       method = &quot;rf&quot;, 
                       metric = &quot;Rsquared&quot;, 
                       trControl = trControl)
    
    print(rf_default)
    
    # Search best mtry
    set.seed(1234444)
    tuneGrid &lt;- expand.grid(.mtry = c(2:6))
    rf_mtry &lt;- train(eq1,
                     data = train,
                     method = &quot;rf&quot;,
                     metric = &quot;Rsquared&quot;,
                     tuneGrid = tuneGrid,
                     trControl = trControl,
                     importance = TRUE,
                     nodesize = 10,
                     ntree = 500)
    
    best_mtry &lt;- rf_mtry$bestTune$mtry
    
    # search best maxnodes
    tuneGrid = expand.grid(.mtry = best_mtry)
    
    best.rsq &lt;- -1
    best.maxnodes &lt;- 0
    for (maxnodes in c(5:15)){
      set.seed(3455556)
      rf_maxnode = train(eq1, 
                         data = train, 
                         method = &quot;rf&quot;, 
                         metric = &quot;Rsquared&quot;, 
                         tuneGrid = tuneGrid, 
                         trControl = trControl, 
                         importance = TRUE, 
                         nodesize = 10, 
                         maxnodes = maxnodes,
                         # tuneLenght = 50,
                         ntree = 500)
      rsq &lt;- rf_maxnode$finalModel$rsq
      if (rsq &gt; best.rsq) {
        best.rsq &lt;- rsq
        best.maxnodes &lt;- maxnodes
      }
    }
    
    # search best ntree
    best.ntree &lt;- -1
    best.rsq &lt;- -1
    for (ntree in seq(from = 500, to = 2000, by = 500)) {
      set.seed(67777789)
      rf_maxtrees = train(eq1, 
                          data = train, 
                          method = &quot;rf&quot;, 
                          metric = &quot;Rsquared&quot;, 
                          tuneGrid = tuneGrid, 
                          trControl = trControl, 
                          importance = TRUE, 
                          nodesize = 10, 
                          maxnodes = best.maxnodes,
                          ntree = ntree)
      rsq &lt;- rf_maxtrees$finalModel$rsq
      if (rsq &gt; best.rsq) {
        best.rsq &lt;- rsq
        best.ntree &lt;- ntree
      }
    }
    
    abw = grf.bw(eq1, 
                 train, 
                 kernel = &quot;adaptive&quot;, 
                 coords = Coords_train, 
                 bw.min = 10,
                 bw.max = 11, 
                 step = 1, 
                 trees = best.ntree, 
                 mtry = best_mtry, 
                 importance = &quot;impurity&quot;,
                 nthreads = 3, 
                 forests = FALSE, 
                 weighted = FALSE)
    
    abw$Best.BW

A small sample data set:

    block.data = structure(list(x = c(4517800L, 4517800L, 4517800L, 4517800L, 
    4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 
    4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 4517800L, 
    4517800L, 4517800L), y = c(4308200L, 4307800L, 4307400L, 4307000L, 
    4306600L, 4306200L, 4305800L, 4305400L, 4305000L, 4304600L, 4304200L, 
    4303800L, 4303400L, 4303000L, 4302600L, 4302200L, 4301800L, 4301400L, 
    4301000L, 4300600L), ntl = c(58.661979675293, 58.436840057373, 
    55.8877563476562, 51.9307441711426, 52.9197845458984, 58.1713027954102, 
    59.4091491699219, 58.0057525634766, 62.3850288391113, 43.693775177002, 
    32.7657432556152, 38.814208984375, 45.2988662719727, 30.7070808410645, 
    12.8714799880981, 12.8714799880981, 13.5219354629517, 23.7446193695068, 
    37.4298362731934, 46.3565788269043), pop = c(75.9549713134766, 
    77.1868209838867, 77.5336608886719, 79.3608703613281, 82.8243789672852, 
    86.7496871948242, 90.5956954956055, 92.8819274902344, 92.3746337890625, 
    87.0225830078125, 80.3043441772461, 84.9410400390625, 85.2942810058594, 
    83.6485595703125, 61.1035919189453, 40.2169418334961, 53.4071769714355, 
    76.6943511962891, 80.3962097167969, 80.990608215332), tirs = c(29.3157043457031, 
    29.6056385040283, 29.3482208251953, 29.9990634918213, 30.0760860443115, 
    30.7479610443115, 32.0067825317383, 32.5420265197754, 33.4829483032227, 
    34.0894584655762, 34.0738372802734, 33.5429801940918, 34.0574531555176, 
    35.3299293518066, 34.9521026611328, 35.1631278991699, 35.4775505065918, 
    34.7371597290039, 33.3271560668945, 34.1149940490723), agbh = c(4.38038444519043, 
    5.61569929122925, 6.89490604400635, 6.30956649780273, 6.12616586685181, 
    6.96545696258545, 6.16812896728516, 1.98371779918671, 1.08961737155914, 
    0.348455667495728, 0.0777057632803917, 0.0572107657790184, 0.53607851266861, 
    0.362500905990601, 0.0665311068296432, 0.138985157012939, 0.217744708061218, 
    0.737295210361481, 3.04893350601196, 1.89484882354736), lc = c(50L, 
    50L, 50L, 50L, 50L, 50L, 50L, 50L, 30L, 60L, 30L, 60L, 50L, 60L, 
    30L, 30L, 30L, 50L, 50L, 60L)), class = &quot;data.frame&quot;, row.names = c(NA, 
    -20L))

</details>


# 答案1
**得分**: 1

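如@MrFlick和@Roland所述,将名为*train*的`data.frame`重命名为*df_train*后,问题得以解决。原因是*train*同时也是`caret`包中函数`train()`的名称:在`grf.bw`内部,该名称被解析成了这个函数(即"closure"),而不是数据框,因此出现"object of type 'closure' is not subsettable"错误。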

完整的代码:

```R
# Packages used by this script: SpatialML (grf.bw, grf), caret (trainControl,
# train) and doParallel (makePSOCKcluster, registerDoParallel, detectCores).
library(SpatialML)
library(caret)
library(doParallel)

wd <- "path/"

block.data <- read.csv(paste0(wd, "block.data.csv"))

# Coordinates of the full data set (first two columns)
Coords <- block.data[, 1:2]

# Convert the categorical lc column to dummy variables (one column per level)
df_dummies <- model.matrix(~ factor(lc) - 1, data = block.data)

# Append the dummy variables to the original data frame
block.data <- cbind(block.data, df_dummies)

# Drop the original lc column; the dummy columns replace it
block.data$lc <- NULL

# Rename the dummy columns so they can be used in the formula.
# NOTE(review): assumes the dummies end up in columns 7:9 and that lc has
# exactly the levels 30, 50 and 60 (in that order) - confirm against the CSV.
colnames(block.data)[7:9] <- c("lc30", "lc50", "lc60")

eq1 <- ntl ~ pop + tirs + agbh + lc30 + lc50 + lc60

set.seed(1123)

# 80/20 train/test split by row index
samp <- sample(nrow(block.data), 0.80 * nrow(block.data))

# Deliberately NOT called `train`: that name clashes with caret::train()
df_train <- block.data[samp, ]

Coords_train <- df_train[, 1:2]

test <- block.data[-samp, ]

Coords_test <- test[, 1:2]

# Parallel back end for caret; leave one core free
no_cores <- detectCores() - 1
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

# Resampling settings shared by every caret fit below: repeated
# cross-validation with number = 3, grid search, no stored predictions.
trControl <- trainControl(
  method = "repeatedcv",
  number = 3,
  search = "grid",
  savePredictions = FALSE
)

# Baseline random forest using caret's default tuning grid
rf_default <- train(
  eq1,
  data = df_train,
  method = "rf",
  metric = "Rsquared",
  trControl = trControl
)

print(rf_default)

# Search for the best mtry over the candidate values 2 to 6
set.seed(1234444)
tuneGrid <- expand.grid(.mtry = 2:6)
rf_mtry <- train(
  eq1,
  data = df_train,
  method = "rf",
  metric = "Rsquared",
  tuneGrid = tuneGrid,
  trControl = trControl,
  importance = TRUE,
  nodesize = 10,
  ntree = 500
)

# mtry value selected by caret
best_mtry <- rf_mtry$bestTune$mtry

# Search for the best maxnodes, keeping mtry fixed at the tuned value
tuneGrid <- expand.grid(.mtry = best_mtry)

# Start from -Inf so the first candidate is always accepted, even when its
# pseudo R-squared is below -1 (otherwise best.maxnodes could stay at 0).
best.rsq <- -Inf
best.maxnodes <- 0
for (maxnodes in 5:15) {
  set.seed(3455556)
  rf_maxnode <- train(eq1,
                      data = df_train,
                      method = "rf",
                      metric = "Rsquared",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 10,
                      maxnodes = maxnodes,
                      ntree = 500)
  # finalModel$rsq is a vector with one pseudo R-squared per number of trees
  # grown, so it cannot be used directly in if(): a condition of length > 1 is
  # an error in R >= 4.2 (and older versions silently used only the first
  # tree). Take the last element, i.e. the value for the complete forest.
  rsq <- tail(rf_maxnode$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.maxnodes <- maxnodes
  }
}

# Search for the best ntree among 500, 1000, 1500 and 2000
best.ntree <- -1
# Start from -Inf so the first candidate is always accepted, even when its
# pseudo R-squared is below -1 (otherwise best.ntree could stay at -1).
best.rsq <- -Inf
for (ntree in seq(from = 500, to = 2000, by = 500)) {
  set.seed(67777789)
  rf_maxtrees <- train(eq1,
                       data = df_train,
                       method = "rf",
                       metric = "Rsquared",
                       tuneGrid = tuneGrid,
                       trControl = trControl,
                       importance = TRUE,
                       nodesize = 10,
                       maxnodes = best.maxnodes,
                       ntree = ntree)
  # finalModel$rsq is a vector with one pseudo R-squared per number of trees
  # grown, so it cannot be used directly in if(): a condition of length > 1 is
  # an error in R >= 4.2 (and older versions silently used only the first
  # tree). Take the last element, i.e. the value for the complete forest.
  rsq <- tail(rf_maxtrees$finalModel$rsq, 1)
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.ntree <- ntree
  }
}

# Search for the adaptive-kernel bandwidth on the training subset, testing
# bandwidths from bw.min to bw.max in increments of `step`, with the tuned
# number of trees and mtry.
# The data frame is passed as `df_train`; a data frame named `train` clashes
# with caret's train() function and triggers
# "object of type 'closure' is not subsettable".
# NOTE(review): confirm that `weighted` is the argument name used by the
# installed SpatialML version - TODO verify against ?grf.bw.
abw = grf.bw(eq1, 
             df_train, 
             kernel = "adaptive", 
             coords = Coords_train, 
             bw.min = 10,
             bw.max = 11, 
             step = 1, 
             trees = best.ntree, 
             mtry = best_mtry, 
             importance = "impurity",
             nthreads = 3, 
             forests = FALSE, 
             weighted = FALSE)

# Bandwidth selected by the search
abw$Best.BW

# Fit the geographically weighted random forest on the training rows using
# the selected bandwidth, number of trees and mtry
grf.model <- grf(eq1,
                 dframe = df_train,
                 bw = abw$Best.BW,
                 ntree = best.ntree,
                 mtry = best_mtry,
                 kernel = "adaptive",
                 forests = TRUE,
                 coords = Coords_train,
                 nthreads = 3)

# Check R-squared on the test data.
# The data frame must be `test` together with its own coordinates
# `Coords_test`: the bare name `train` now refers to caret::train() (a
# function), which reproduces the "object of type 'closure' is not
# subsettable" error, and `Coords` has one row per observation of the full
# data set rather than of a subset.
# NOTE(review): this fits a separate model on the test rows; if out-of-sample
# predictions from grf.model are wanted instead, see ?predict.grf - confirm
# which is intended.
grf.model2 <- grf(eq1,
                  dframe = test,
                  bw = abw$Best.BW,
                  ntree = best.ntree,
                  mtry = best_mtry,
                  kernel = "adaptive",
                  forests = TRUE,
                  coords = Coords_test,
                  nthreads = 3)

grf.model2

# Release the worker processes started by makePSOCKcluster()
stopCluster(cl)
英文:

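As stated by @MrFlick and @Roland, renaming the data.frame called train to df_train solves the problem. The reason is that train is also the name of the caret function train(): inside grf.bw the name was resolved to that function (a "closure") instead of the data frame, hence the error "object of type 'closure' is not subsettable".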

The complete code:

wd = &quot;path/&quot;
block.data = read.csv(paste0(wd, &quot;block.data.csv&quot;))
Coords&lt;-block.data[ ,1:2]
# convert x3 to dummy variables
df_dummies &lt;- model.matrix(~ factor(lc) - 1, data = block.data)
# merge the dummy variables with the original data frame
block.data &lt;- cbind(block.data, df_dummies)
# remove the original x3 variable
block.data$lc &lt;- NULL
colnames(block.data)[7:9] = c(&quot;lc30&quot;, &quot;lc50&quot;, &quot;lc60&quot;)
eq1 = ntl ~ pop + tirs + agbh + lc30 + lc50 + lc60
set.seed(1123)
samp &lt;- sample(nrow(block.data), 0.80 * nrow(block.data))
df_train &lt;- block.data[samp, ]
Coords_train &lt;- df_train[ ,1:2]
test &lt;- block.data[-samp, ]
Coords_test &lt;- test[ ,1:2]
no_cores &lt;- detectCores() - 1
cl = makePSOCKcluster(no_cores)
registerDoParallel(cl)
# define the control
trControl = trainControl(method = &quot;repeatedcv&quot;, 
number = 3, 
search = &quot;grid&quot;,
savePredictions = FALSE)
rf_default = train(eq1, 
data = df_train, 
method = &quot;rf&quot;, 
metric = &quot;Rsquared&quot;, 
trControl = trControl)
print(rf_default)
# Search best mtry
set.seed(1234444)
tuneGrid &lt;- expand.grid(.mtry = c(2:6))
rf_mtry &lt;- train(eq1,
data = df_train,
method = &quot;rf&quot;,
metric = &quot;Rsquared&quot;,
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 10,
ntree = 500)
best_mtry &lt;- rf_mtry$bestTune$mtry
# search best maxnodes
tuneGrid = expand.grid(.mtry = best_mtry)
best.rsq &lt;- -1
best.maxnodes &lt;- 0
for (maxnodes in c(5:15)){
set.seed(3455556)
rf_maxnode = train(eq1, 
data = df_train, 
method = &quot;rf&quot;, 
metric = &quot;Rsquared&quot;, 
tuneGrid = tuneGrid, 
trControl = trControl, 
importance = TRUE, 
nodesize = 10, 
maxnodes = maxnodes,
# tuneLenght = 50,
ntree = 500)
rsq &lt;- rf_maxnode$finalModel$rsq
if (rsq &gt; best.rsq) {
best.rsq &lt;- rsq
best.maxnodes &lt;- maxnodes
}
}
# search best ntree
best.ntree &lt;- -1
best.rsq &lt;- -1
for (ntree in seq(from = 500, to = 2000, by = 500)) {
set.seed(67777789)
rf_maxtrees = train(eq1, 
data = df_train, 
method = &quot;rf&quot;, 
metric = &quot;Rsquared&quot;, 
tuneGrid = tuneGrid, 
trControl = trControl, 
importance = TRUE, 
nodesize = 10, 
maxnodes = best.maxnodes,
ntree = ntree)
rsq &lt;- rf_maxtrees$finalModel$rsq
if (rsq &gt; best.rsq) {
best.rsq &lt;- rsq
best.ntree &lt;- ntree
}
}
abw = grf.bw(eq1, 
df_train, 
kernel = &quot;adaptive&quot;, 
coords = Coords_train, 
bw.min = 10,
bw.max = 11, 
step = 1, 
trees = best.ntree, 
mtry = best_mtry, 
importance = &quot;impurity&quot;,
nthreads = 3, 
forests = FALSE, 
weighted = FALSE)
abw$Best.BW
# Train the model with the optimum bandwidth, number of trees and mtry.
# The data frame is `df_train`; the bare name `train` refers to caret::train()
# (a function) and would reproduce the "closure is not subsettable" error.
grf.model <- grf(eq1,
                 dframe = df_train,
                 bw = abw$Best.BW,
                 ntree = best.ntree,
                 mtry = best_mtry,
                 kernel = "adaptive",
                 forests = TRUE,
                 coords = Coords_train,
                 nthreads = 3)

# Check R-squared on the test data, using the test rows together with their
# own coordinates (`Coords` covers the full data set, not a subset).
# NOTE(review): this fits a separate model on the test rows; if out-of-sample
# predictions from grf.model are wanted instead, see ?predict.grf - confirm
# which is intended.
grf.model2 <- grf(eq1,
                  dframe = test,
                  bw = abw$Best.BW,
                  ntree = best.ntree,
                  mtry = best_mtry,
                  kernel = "adaptive",
                  forests = TRUE,
                  coords = Coords_test,
                  nthreads = 3)

grf.model2

# Release the worker processes started by makePSOCKcluster()
stopCluster(cl)

huangapple
  • 本文由 发表于 2023年2月23日 23:00:52
  • 转载请务必保留本文链接:https://go.coder-hub.com/75546569.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定