Putting the Elements of a Cluster into a Dataframe
Question
Below I use functional principal component analysis to reduce the dimensionality of a pre-1989 time series. Afterwards, I cluster on the fPCA scores. Consider the following, which sets up the problem:
library(splines)
library(fda)
library(dplyr)
set.seed(1420)
data <- read.csv("https://raw.githubusercontent.com/synth-inference/synthdid/master/data/california_prop99.csv",
                 header = TRUE, sep = ";")
# Imports our dataset
data = subset(data, select = c(State, Year, PacksPerCapita, treated))
# Restricts our dataset to the outcome, time, and panel
preperiod = nrow(data[data$treated == 1 & data$State == "California", ])
# Now we count the number of T_0 periods...
data <- data %>% select(-treated)
# The treatment variable is no longer needed.
nptperiod = preperiod + 2
if (preperiod < 3) {
  # If we have fewer than 3 pre-periods, stop.
  stop("error message to print")
}
# Step 0.2: Reshape our dataframe to wide.
trainframe = reshape(data, idvar = "State", timevar = "Year", direction = "wide")
# Step 1: Define X as the pre-intervention
# period for both the treated unit and
# the non-treated units in the data set
X = trainframe[, 2:nptperiod]
last = nptperiod - 1
x = seq(0, 1, length = last)
# Step 1.01: Define our spline basis for the eigenfunctions
splinebasis = create.bspline.basis(c(0, 1), 10)
smooth = smooth.basis(x, t(X), splinebasis)
# Step 1.2: Compute FPC scores xi for all units
# that explain most of the variation in the data
Xfun = smooth$fd
# Do PCA over it
pca = pca.fd(Xfun, 10)
var.pca = cumsum(pca$varprop)
nharm = sum(var.pca < 0.95) + 1
pc = pca.fd(Xfun, nharm)
# Step 2: Apply the K-means algorithm to the
# FPC scores and extract the donor pool
# Put our fPCA scores into a matrix
cluster_x = as.matrix(pc$scores)
cluster_x <- scale(cluster_x)
# 2.1: Calculate the silhouette statistics
library(cluster)
k.max = 8
sil = rep(0, k.max)
# Compute the average silhouette width
for (i in 2:k.max) {
  tmp = kmeans(cluster_x, centers = i, nstart = 10)
  ss <- silhouette(tmp$cluster, dist(cluster_x))
  # Our silhouette stat
  sil[i] <- mean(ss[, 3])
}
# Now, we get the index of the vector entry
# that has the largest silhouette stat
optnum = which.max(sil)
# 2.2: K-means cluster analysis
fit <- kmeans(cluster_x, optnum)  # an optnum-cluster solution
trainframe <- data.frame(trainframe, fit$cluster)
cols = ncol(trainframe)
colm1 = cols - 1
The cluster California (the treated unit) belongs to is cluster 1, and I want to keep only the states that are in cluster 1. But when I check whether California is in new_data after clustering, like so
trainframe[, c(1, cols)]
treat_cluster = fit$cluster[11]
new_data = trainframe[trainframe[, cols] == treat_cluster, 1:colm1]
library(stringr)  # needed for str_detect()
istr = sum(str_detect(new_data$fullname, '^California$')) > 0
if (!istr) {
  # If California isn't there, stop.
  stop("You MUST have California in the cluster.")
}
R tells me that California isn't found, even though it should be. How do I save cluster 1 into its own dataframe? I suspect the issue has to do with treat_cluster = fit$cluster[11].
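Two things are worth verifying directly here; a minimal diagnostic sketch, assuming the code above has already run:
# kmeans() assigns cluster labels in row order, so fit$cluster[11] is
# only California's cluster if California sits in row 11 of trainframe.
trainframe$State[11]
# A lookup keyed on the unit name avoids the hard-coded position:
ca_row <- which(trainframe$State == "California")
fit$cluster[ca_row]
# Separately, the unit column kept from the CSV is State, not fullname,
# so str_detect(new_data$fullname, ...) searches a column that does not
# exist, and the presence check fails no matter how the clustering went.
names(new_data)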
Answer 1
Score: 0
As promised, here's the solution. We begin with the setup:
library(splines)
library(fda)
library(ggplot2)
library(dplyr)
library(stringr)
set.seed(1420)
data <- read.csv("https://raw.githubusercontent.com/synth-inference/synthdid/master/data/california_prop99.csv",
                 header = TRUE, sep = ";")
# Imports our dataset
data = subset(data, select = c(State, Year, PacksPerCapita, treated))
# Restricts our dataset to the outcome, time, and panel
preperiod = nrow(data[data$treated == 0 & data$State == "California", ])
# Now we count the number of T_0 periods...
data <- data %>% select(-treated)
# The treatment variable is no longer needed.
nptperiod = preperiod + 2
if (preperiod < 3) {
  # If we have fewer than 3 pre-periods, stop.
  stop("error message to print")
}
# Step 0.2: Reshape our dataframe to wide.
trainframe = reshape(data, idvar = "State", timevar = "Year", direction = "wide")
# Step 1: Define X as the pre-intervention
# period for both the treated unit and
# the non-treated units in the data set
X = trainframe[, 2:nptperiod]
last = nptperiod - 1
x = seq(0, 1, length = last)
# Step 1.01: Define our spline basis for the eigenfunctions
splinebasis = create.bspline.basis(c(0, 1), 10)
smooth = smooth.basis(x, t(X), splinebasis)
# Step 1.2: Compute FPC scores xi for all units
# that explain most of the variation in the data
Xfun = smooth$fd
# Do PCA over it
pca = pca.fd(Xfun, 10)
var.pca = cumsum(pca$varprop)
nharm = sum(var.pca < 0.95) + 1
pc = pca.fd(Xfun, nharm)
where we estimate the functional principal components. Next, we run k-means over the FPC scores.
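Before clustering, it is worth confirming how much of the variation the retained harmonics actually capture; a short inspection sketch using the objects defined above:
# Share of variance explained by each retained harmonic, plus the
# cumulative share that drove the choice of nharm.
round(pc$varprop, 3)
cumsum(pc$varprop)
# The estimated eigenfunctions can be plotted for a visual check.
plot(pc$harmonics)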
# Step 2: Apply the K-means algorithm to the
# FPC scores and extract the donor pool
# Put our fPCA scores into a matrix
cluster_x = as.matrix(pc$scores)
cluster_x <- scale(cluster_x)
# 2.1: Calculate the silhouette statistics
library(cluster)
k.max = 8
sil = rep(0, k.max)
# Compute the average silhouette width
for (i in 2:k.max) {
  tmp = kmeans(cluster_x, centers = i, nstart = 10)
  ss <- silhouette(tmp$cluster, dist(cluster_x))
  # Our silhouette stat
  sil[i] <- mean(ss[, 3])
}
# Now, we get the index of the vector entry
# that has the largest silhouette stat
optnum = which.max(sil)
# 2.2: K-means cluster analysis
fit <- kmeans(cluster_x, optnum)  # an optnum-cluster solution
trainframe <- data.frame(trainframe, fit$cluster)
cols = ncol(trainframe)
colm1 = cols - 1
# Extract California's cluster label from the appended cluster column
cluster_num = sum(trainframe[which(trainframe[[1]] == 'California'), cols])
The last step here is key: I extract the value of the cluster column for the row where column 1, our unit column, is California. Next, we match the cluster California falls in to its position in fit$cluster, the vector of cluster assignments in row order.
trainframe[, c(1, cols)]
position = match(cluster_num, fit$cluster)
treat_cluster = fit$cluster[position]
new_data = trainframe[trainframe[, cols] == treat_cluster, 1:colm1]
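Because treat_cluster necessarily equals cluster_num whenever the match succeeds, the same donor frame can be built by filtering on cluster_num directly; a hedged simplification rather than the code above:
# cluster_num is already California's cluster label, so the match()
# detour can be skipped and the cluster column filtered directly.
new_data_direct <- trainframe[trainframe[, cols] == cluster_num, 1:colm1]
identical(new_data, new_data_direct)  # expected TRUE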
We now have a dataframe consisting of the optimal subset of donors.
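As a final guard, the presence check from the question can be rerun on the donor frame, keyed on the State column; a minimal sketch assuming new_data from above:
library(stringr)  # already attached in the setup above
# Stop early if the treated unit did not survive the subsetting.
if (!any(str_detect(new_data$State, "^California$"))) {
  stop("You MUST have California in the cluster.")
}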