Putting the Elements of a Cluster into a Dataframe

Question

Below I use functional principal component analysis to reduce the dimensionality of a pre-1989 time series. Afterwards, I cluster on the fPCA scores. Consider the following, which sets up the problem:

library(splines)
library(fda)
library(dplyr)
library(stringr)  # provides str_detect(), used further below
set.seed(1420)
data <- read.csv("https://raw.githubusercontent.com/synth-inference/synthdid/master/data/california_prop99.csv",
                 header = TRUE, sep = ";")
# Imports our dataset
data = subset(data, select = c(State, Year, PacksPerCapita, treated))
# Restricts our dataset to the outcome, time, and panel
preperiod = nrow(data[data$treated == 1 & data$State == "California", ])
# Now we count the number of T_0 periods...
data <- data %>% select(-treated)
# The treatment variable is no longer needed.
nptperiod = preperiod + 2
if (preperiod < 3) {
  # If we have fewer than 3 pre-periods, stop.
  stop("error message to print")
}
# Step 0.2: Reshape our dataframe to wide.
trainframe = reshape(data, idvar = "State", timevar = "Year", direction = "wide")
# Step 1: Define X as the pre-intervention
# period for both the treated unit and the
# non-treated units in the data set
X = trainframe[, 2:nptperiod]
last = nptperiod - 1
x = seq(0, 1, length = last)
# Step 1.01: Define our spline basis for the eigenfunctions
splinebasis = create.bspline.basis(c(0, 1), 10)
smooth = smooth.basis(x, t(X), splinebasis)
# Step 1.2: Compute FPC scores xi for all units
# that explain most of the variation in the data
Xfun = smooth$fd
# Do PCA over it
pca = pca.fd(Xfun, 10)
var.pca = cumsum(pca$varprop)
nharm = sum(var.pca < 0.95) + 1
pc = pca.fd(Xfun, nharm)
# Step 2: Apply the K-means algorithm to the
# FPC scores and extract the donor pool
# Put our fPCA scores into a matrix
cluster_x = as.matrix(pc$scores)
cluster_x <- scale(cluster_x)
# 2.1: Calculating the silhouette statistics
library(cluster)
k.max = 8
sil = rep(0, k.max)
# Compute the average silhouette width
for (i in 2:k.max) {
  tmp = kmeans(cluster_x, centers = i, nstart = 10)
  ss <- silhouette(tmp$cluster, dist(cluster_x))
  # Our silhouette stat
  sil[i] <- mean(ss[, 3])
}
# Now, we take the entry of the vector
# that has the largest silhouette stat
optnum = which.max(sil)
# 2.2: K-means cluster analysis
fit <- kmeans(cluster_x, optnum)  # 5 cluster solution
# 5 cluster solution?
trainframe <- data.frame(trainframe, fit$cluster)
cols = ncol(trainframe)
colm1 = cols - 1

California (the treated unit) belongs to cluster 1, and I want to keep only the states that are in cluster 1. But when I check whether it's in new_data after clustering, like so:

trainframe[, c(1, cols)]
treat_cluster = fit$cluster[11]
new_data = trainframe[trainframe[, cols] == treat_cluster, 1:colm1]
istr = sum(str_detect(new_data$fullname, '^California$')) > 0
if (istr == "FALSE") {
  # If California isn't there, stop.
  stop("You MUST have California in the cluster.")
}

R tells me that California isn't found, even though it should be there. How do I save cluster 1 into its own dataframe? I suspect the issue has to do with treat_cluster = fit$cluster[11].
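
For what it's worth, fit$cluster here is an unnamed integer vector in the same row order as cluster_x (and therefore trainframe), so fit$cluster[11] only refers to California if California happens to sit in row 11. A minimal sketch of a name-based lookup instead (ca_row is a hypothetical name introduced for illustration, not part of the code above):

# Sketch: find California's row by name rather than by a hardcoded position.
ca_row <- which(trainframe$State == "California")
ca_row                                # check whether this really is 11
treat_cluster <- fit$cluster[ca_row]  # cluster label looked up by row index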

Answer 1

Score: 0

As promised, here's the solution. We begin with the setup

library(splines)
library(fda)
library(ggplot2)
library(dplyr)
library(stringr)
set.seed(1420)
data <- read.csv("https://raw.githubusercontent.com/synth-inference/synthdid/master/data/california_prop99.csv",
                 header = TRUE, sep = ";")
# Imports our dataset
data = subset(data, select = c(State, Year, PacksPerCapita, treated))
# Restricts our dataset to the outcome, time, and panel
preperiod = nrow(data[data$treated == 0 & data$State == "California", ])
# Now we count the number of T_0 periods...
data <- data %>% select(-treated)
# The treatment variable is no longer needed.
nptperiod = preperiod + 2
if (preperiod < 3) {
  # If we have fewer than 3 pre-periods, stop.
  stop("error message to print")
}
# Step 0.2: Reshape our dataframe to wide.
trainframe = reshape(data, idvar = "State", timevar = "Year", direction = "wide")
# Step 1: Define X as the pre-intervention
# period for both the treated unit and the
# non-treated units in the data set
X = trainframe[, 2:nptperiod]
last = nptperiod - 1
x = seq(0, 1, length = last)
# Step 1.01: Define our spline basis for the eigenfunctions
splinebasis = create.bspline.basis(c(0, 1), 10)
smooth = smooth.basis(x, t(X), splinebasis)
# Step 1.2: Compute FPC scores xi for all units
# that explain most of the variation in the data
Xfun = smooth$fd
# Do PCA over it
pca = pca.fd(Xfun, 10)
var.pca = cumsum(pca$varprop)
nharm = sum(var.pca < 0.95) + 1
pc = pca.fd(Xfun, nharm)

where we estimate the functional principal components. Next, we run k-means over them.

# Step 2: Apply the K-means algorithm to the
# FPC scores and extract the donor pool
# Put our fPCA scores into a matrix
cluster_x = as.matrix(pc$scores)
cluster_x <- scale(cluster_x)
# 2.1: Calculating the silhouette statistics
library(cluster)
k.max = 8
sil = rep(0, k.max)
# Compute the average silhouette width
for (i in 2:k.max) {
  tmp = kmeans(cluster_x, centers = i, nstart = 10)
  ss <- silhouette(tmp$cluster, dist(cluster_x))
  # Our silhouette stat
  sil[i] <- mean(ss[, 3])
}
# Now, we take the entry of the vector
# that has the largest silhouette stat
optnum = which.max(sil)
# 2.2: K-means cluster analysis
fit <- kmeans(cluster_x, optnum)  # 5 cluster solution
trainframe <- data.frame(trainframe, fit$cluster)
cols = ncol(trainframe)
colm1 = cols - 1
cluster_num = sum(trainframe[which(trainframe[[1]] == 'California'), cols])

The last step here is key: I extract the value of the cluster column for the row where column 1, our unit column, is California. Next, we match California's cluster number against fit$cluster, the vector of cluster assignments in row order.

trainframe[, c(1, cols)]                       # inspect each State next to its cluster label
position = match(c(cluster_num), fit$cluster)  # first row assigned to California's cluster
treat_cluster = fit$cluster[position]          # that row's label, i.e. cluster_num itself
new_data = trainframe[trainframe[, cols] == treat_cluster, 1:colm1]  # keep only that cluster

We now have a dataframe consisting of the optimal subset of donors.
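
As a quick sanity check, one can mirror the guard from the question (a sketch, assuming new_data was built as above; note the unit column in this dataset is State, not fullname):

# Confirm the treated unit survived the filter (stringr is loaded in the setup above).
istr = sum(str_detect(new_data$State, '^California$')) > 0
if (!istr) {
  # If California isn't there, stop.
  stop("You MUST have California in the cluster.")
}
nrow(new_data)  # how many states remain: the donors plus California itself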
