英文:
Knn algorithm in regression
问题
我有一个数据集,我正在尝试使用KNN来基于“Number of plants”来预测“Total kg”。
# Question code as posted; the post reports that it raises
# "ValueError: Unknown label type: 'continuous'".
# NOTE(review): `pd` is used without an import in this snippet — presumably
# `import pandas as pd` ran earlier; confirm.
from sklearn.neighbors import KNeighborsClassifier
df = pd.read_csv(r'final2.csv')
# Features: every column except the target 'Total kg'.
X = df.drop('Total kg', axis=1)
# Target selected with double brackets, i.e. as a one-column DataFrame.
Y = df[["Total kg"]]
# Convert both to NumPy arrays.
X = X.values
Y = Y.values
# A classifier is fitted on the 'Total kg' values here.
knn = KNeighborsClassifier(n_neighbors=1)
prediction = knn.fit(X, Y)
我收到以下错误:ValueError: Unknown label type: 'continuous'。
有没有办法使用KNN来处理连续变量?
英文:
Number of plants | Total kg |
---|---|
900 | 7.565 |
1440 | 17.808 |
2340 | 25.373 |
900 | 10.340 |
900 | 10.610 |
I have a data set and I am trying to predict the "Total kg" based on the "Number of plants" using KNN.
# Question code as posted; the post reports that it raises
# "ValueError: Unknown label type: 'continuous'".
# NOTE(review): `pd` is used without an import in this snippet — presumably
# `import pandas as pd` ran earlier; confirm.
from sklearn.neighbors import KNeighborsClassifier
df = pd.read_csv(r'final2.csv')
# Features: every column except the target 'Total kg'.
X = df.drop('Total kg', axis=1)
# Target selected with double brackets, i.e. as a one-column DataFrame.
Y = df[["Total kg"]]
# Convert both to NumPy arrays.
X=X.values
Y=Y.values
# A classifier is fitted on the 'Total kg' values here.
knn = KNeighborsClassifier(n_neighbors=1)
prediction = knn.fit(X,Y)
I get the following error: ValueError: Unknown label type: 'continuous'
Is there a way to use KNN for continuous variables?
答案1
得分: 1
不应在回归任务中使用分类器。尝试:
# The regressor variant accepts a continuous target; it has to be imported
# explicitly because the question only imports KNeighborsClassifier.
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=1)
英文:
You should not use a classifier for a regression task. Try:
# The regressor variant accepts a continuous target; it has to be imported
# explicitly because the question only imports KNeighborsClassifier.
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=1)
答案2
得分: 1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Scaling
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
# Train Test Split
from sklearn.model_selection import train_test_split
# Models
import torch
import torch.nn as nn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_curve
# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
# Silence all warnings for the rest of the script.
warnings.filterwarnings("ignore")
# Load the dataset from heart.csv.
df = pd.read_csv("heart.csv")
# Preview of the first 10 rows of the data
df.head(10)
df.dtypes
# Shape of data
print("The shape of the dataset is:", df.shape)
# Get categorical (string-typed) columns
# NOTE(review): depending on the pandas version, text columns loaded by
# read_csv may have object dtype and not be matched by "string" — confirm.
string_col = df.select_dtypes("string").columns.to_list()
# Check the number of unique values in each column
# (named unique_counts so the builtin `dict` is not shadowed).
unique_counts = {column: df[column].value_counts().shape[0] for column in df.columns}
pd.DataFrame(unique_counts, index=["unique count"]).transpose()
# Check duplicated
df[df.duplicated()]
# Remove duplicates, keeping the first occurrence
df.drop_duplicates(keep='first', inplace=True)
# Check new shape
print('Number of rows are', df.shape[0], 'and number of columns are', df.shape[1])
# Numeric feature columns: everything that is neither string-typed nor the target.
num_col = df.columns.to_list()
for col in string_col:
    num_col.remove(col)
num_col.remove("output")
df[string_col].head()
for col in string_col:
    print(f"The distribution of categorical values in the {col} is:")
    print(df[col].value_counts())
# Check statistical data
df.describe()
# Pairwise correlation matrix (the method is corr(); coor() does not exist).
df.corr()
# Separate the columns into categorical and continuous
cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]
target_col = ["output"]
print("The categorical cols are:", cat_cols)
print("The continuous cols are:", con_cols)
print("The target variable is:", target_col)
df[con_cols].describe().transpose()
# EDA
# Pair plot
# NOTE(review): seaborn's pairplot builds its own figure, so this plt.figure
# call probably leaves an extra empty figure — confirm before removing it.
plt.figure(figsize=(20, 20))
sns.pairplot(df, hue='output', palette=["#8000ff", "#da8829"])
plt.show()
# Violin plots: one panel per categorical feature on a 2x3 grid.
plt.figure(figsize=(18, 10))
violin_columns = ['sex', 'thall', 'exng', 'restecg', 'cp', 'fbs']
for position, column in enumerate(violin_columns, start=1):
    plt.subplot(2, 3, position)
    sns.violinplot(x=column, y='output', data=df)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Heatmap
px.imshow(df.corr(), title="Correlation Plot of the Heart Failure Prediction")
plt.figure(figsize=(16, 8))
sns.heatmap(df.corr(), annot=True, cmap='gnuplot2_r', fmt='.1f')
# Count plot of categorical features
fig = plt.figure(figsize=(18, 15))
gs = fig.add_gridspec(3, 3)
gs.update(wspace=0.5, hspace=0.25)
# 3x3 grid created row by row; ax0 carries the title, the others the plots.
axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)]
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = axes
background_color = "#ffe6e6"
color_palette = ["#800000", "#8000ff", "#6aac90", "#5833ff", "#da8829"]
fig.patch.set_facecolor(background_color)
for panel in axes:
    panel.set_facecolor(background_color)
# Title of the plot: a blank panel (no spines, ticks or labels) holding text.
for side in ("bottom", "left", "top", "right"):
    ax0.spines[side].set_visible(False)
ax0.tick_params(left=False, bottom=False)
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.text(0.5, 0.5,
         'Count plot for various\n categorical features\n_________________',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=18, fontweight='bold',
         fontfamily='serif',
         color="#000000")
# (axis, column, label text, label x, label y) for each count plot.
count_specs = [
    (ax1, 'sex', 'Sex', 0.3, 220),
    (ax2, 'exng', 'Exng', 0.3, 220),
    (ax3, 'caa', 'Caa', 1.5, 200),
]
for panel, column, label, label_x, label_y in count_specs:
    panel.text(label_x, label_y, label, fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.countplot(ax=panel, data=df, x=column, palette=color_palette)
    panel.set_xlabel("")
    panel.set_ylabel("")
英文:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Scaling
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
# Train Test Split
from sklearn.model_selection import train_test_split
# Models
import torch
import torch.nn as nn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_curve
# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
# Load the dataset from heart.csv.
df = pd.read_csv("heart.csv")
# Preview of the first 10 rows of the data
df.head(10)
df.dtypes
# Shape of data
print("The shape of the dataset is : ", df.shape)
# Get categorical (string-typed) columns
# NOTE(review): depending on the pandas version, text columns loaded by
# read_csv may have object dtype and not be matched by "string" — confirm.
string_col = df.select_dtypes("string").columns.to_list()
# Check the number of unique values in each column
# (named unique_counts so the builtin `dict` is not shadowed).
unique_counts = {column: df[column].value_counts().shape[0] for column in df.columns}
pd.DataFrame(unique_counts, index=["unique count"]).transpose()
# Check duplicated
df[df.duplicated()]
# Remove duplicates, keeping the first occurrence
df.drop_duplicates(keep='first', inplace=True)
# Check new shape
print('Number of rows are',df.shape[0], 'and number of columns are ',df.shape[1])
# Numeric feature columns: everything that is neither string-typed nor the target.
num_col = df.columns.to_list()
for col in string_col:
    num_col.remove(col)
num_col.remove("output")
df[string_col].head()
for col in string_col:
    print(f"The distribution of categorical valeus in the {col} is : ")
    print(df[col].value_counts())
# Check statistical data
df.describe()
# Pairwise correlation matrix (the method is corr(); coor() does not exist).
df.corr()
# Separate the columns into categorical and continuous
cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]
target_col = ["output"]
print("The categorial cols are : ", cat_cols)
print("The continuous cols are : ", con_cols)
print("The target variable is : ", target_col)
df[con_cols].describe().transpose()
# EDA
# Pair plot
# NOTE(review): seaborn's pairplot builds its own figure, so this plt.figure
# call probably leaves an extra empty figure — confirm before removing it.
plt.figure(figsize=(20, 20))
sns.pairplot(df, hue='output', palette=["#8000ff", "#da8829"])
plt.show()
# Violin plots: one panel per categorical feature on a 2x3 grid.
plt.figure(figsize=(18, 10))
violin_columns = ['sex', 'thall', 'exng', 'restecg', 'cp', 'fbs']
for position, column in enumerate(violin_columns, start=1):
    plt.subplot(2, 3, position)
    sns.violinplot(x=column, y='output', data=df)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Heatmap
px.imshow(df.corr(), title="Correlation Plot of the Heat Failure Prediction")
plt.figure(figsize=(16, 8))
sns.heatmap(df.corr(), annot=True, cmap='gnuplot2_r', fmt='.1f');
# Count plot of categorical features
fig = plt.figure(figsize=(18,15))
gs = fig.add_gridspec(3,3)
gs.update(wspace=0.5, hspace=0.25)
# 3x3 grid created row by row; ax0 carries the title, ax1..ax8 the plots.
axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)]
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = axes
background_color = "#ffe6e6"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
fig.patch.set_facecolor(background_color)
for panel in axes:
    panel.set_facecolor(background_color)
# Title of the plot: a blank panel (no spines, ticks or labels) holding text.
for side in ("bottom", "left", "top", "right"):
    ax0.spines[side].set_visible(False)
ax0.tick_params(left=False, bottom=False)
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.text(0.5,0.5,
         'Count plot for various\n categorical features\n_________________',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=18, fontweight='bold',
         fontfamily='serif',
         color="#000000")
# (axis, column, label text, label x, label y) for each count plot.
count_specs = [
    (ax1, 'sex', 'Sex', 0.3, 220),
    (ax2, 'exng', 'Exng', 0.3, 220),
    (ax3, 'caa', 'Caa', 1.5, 200),
    (ax4, 'cp', 'Cp', 1.5, 162),
    (ax5, 'fbs', 'Fbs', 0.5, 290),
    (ax6, 'restecg', 'Restecg', 0.75, 165),
    (ax7, 'slp', 'Slp', 0.85, 155),
    (ax8, 'thall', 'Thall', 1.2, 180),
]
for panel, column, label, label_x, label_y in count_specs:
    panel.text(label_x, label_y, label, fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))
    sns.countplot(ax=panel,data=df,x=column,palette=color_palette)
    panel.set_xlabel("")
    panel.set_ylabel("")
    # Hide every spine except the bottom one (the page garbled the `[s]`
    # subscript of the original loop).
    for s in ["top","right","left"]:
        panel.spines[s].set_visible(False)
# Boxen plots of the continuous features
fig = plt.figure(figsize=(18,16))
gs = fig.add_gridspec(2,3)
gs.update(wspace=0.3, hspace=0.15)
# 2x3 grid created row by row; ax0 carries the title.
axes = [fig.add_subplot(gs[row, col]) for row in range(2) for col in range(3)]
ax0, ax1, ax2, ax3, ax4, ax5 = axes
background_color = "#ffe6e6"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
fig.patch.set_facecolor(background_color)
for panel in axes:
    panel.set_facecolor(background_color)
# Title panel: no spines, ticks or tick labels, only centred text.
for side in ("bottom", "left", "top", "right"):
    ax0.spines[side].set_visible(False)
ax0.tick_params(left=False, bottom=False)
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.text(0.5,0.5,
         'Boxen plot for various\n continuous features\n_________________',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=18, fontweight='bold',
         fontfamily='serif',
         color="#000000")
# (axis, column, label text, label y, colour) for Age, Trtbps and Chol.
boxen_specs = [
    (ax1, 'age', 'Age', 81, "#800000"),
    (ax2, 'trtbps', 'Trtbps', 208, "#8000ff"),
    (ax3, 'chol', 'Chol', 600, "#6aac90"),
]
for panel, column, label, label_y, colour in boxen_specs:
    panel.text(-0.05, label_y, label, fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))
    sns.boxenplot(ax=panel,y=df[column],palette=[colour],width=0.6)
    panel.set_xlabel("")
    panel.set_ylabel("")
# Thalachh
ax4.text(-0.0
答案3
得分: 1
# Quick look at the raw data: column info and summary statistics.
df_raw.info()
df_raw.describe()
# Correlation of each column with 'Age', sorted ascending.
df_corr = df_raw.corr()[['Age']].sort_values(by='Age')
sns.heatmap(df_corr, annot=True)
plt.title('Histogram for xx')
plt.hist(x=df_raw['Age'])
df_raw['Age'].hist()
plt.show()
sns.boxplot(x=df_raw['SibSp'], y=df_raw['Pclass'])
# Count of rows whose 'SibSp' value is 0.
counts = df_raw['SibSp'].value_counts()[0]
counts
# Median of the non-zero 'SibSp' values, used as the replacement value.
df_clean = df_raw[df_raw['SibSp'] != 0]
df_clean
df_clean.reset_index(drop=True)
avg = df_clean['SibSp'].median()
# Assign the result back instead of calling replace/fillna with inplace=True
# on the df_raw['SibSp'] selection: that pattern is deprecated in pandas and
# no longer updates df_raw once copy-on-write is active.
df_raw['SibSp'] = df_raw['SibSp'].replace(0, avg)
df_raw['SibSp'].value_counts()
df_raw['SibSp'] = df_raw['SibSp'].fillna(avg)
df_raw['SibSp'].dropna()
# Linear regression: impute missing 'age' from height and weight.
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
# Fit only on rows where all three columns are present.
df_clean = df_raw[['age', 'height_cm', 'weight_kg']].dropna()
df_predictor = df_clean[['height_cm', 'weight_kg']].copy()
df_target = df_clean['age'].copy()
lm.fit(df_predictor, df_target)
df_cln1 = df_raw.copy()
# Keep df_raw's index on the predictions: DataFrame.fillna aligns on index
# and column labels, so a fresh 0..n-1 index would fill the wrong rows (or
# none) whenever df_raw is not indexed 0..n-1.
# NOTE(review): predict() needs height_cm/weight_kg without NaN — confirm.
df_impute = pd.DataFrame({'age': lm.predict(df_raw[['height_cm', 'weight_kg']])},
                         index=df_raw.index)
df_cln1.fillna(df_impute, inplace=True)
print(df_cln1.isna().sum())
# Feature engineering: one-hot encode the categorical columns.
from sklearn.preprocessing import OneHotEncoder
categorical_variables = ['sex', 'fracture', 'medication']
data_cat = df_raw[categorical_variables]
cat_encoder = OneHotEncoder()
data_cat_onehot = cat_encoder.fit_transform(data_cat)
# Name the encoded columns and reuse df_raw's index so the axis=1 concat
# lines up row by row and does not add anonymous integer column labels.
data_cat_onehot_df = pd.DataFrame(
    data_cat_onehot.toarray(),
    columns=cat_encoder.get_feature_names_out(categorical_variables),
    index=df_raw.index,
)
df_temp = pd.concat([df_raw, data_cat_onehot_df], axis=1)
df_temp = df_temp.drop(categorical_variables, axis=1)
df_eng = df_temp.copy()
df_eng
# Split data: 30 % of the rows go to the validation set, fixed seed.
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(
    df_raw.drop('Target', axis=1),
    df_raw['Target'],
    test_size=0.3,
    random_state=99,
)
# Model: degree-2 polynomial regression.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
poly_feat = PolynomialFeatures(degree=2)
poly_mod_x = poly_feat.fit_transform(np.array(x_train))
poly_model = LinearRegression()
poly_model.fit(poly_mod_x, y_train)
# The transformer is fitted on the training data only; validation data is
# just transformed.
poly_mod_x_val = poly_feat.transform(np.array(x_valid))
y_pred = poly_model.predict(poly_mod_x_val)
rmse = np.sqrt(mse(y_valid, y_pred))
# r2_score takes (y_true, y_pred); the result gets its own name so the
# imported function is not overwritten by a float.
poly_r2 = r2_score(y_valid, y_pred)
print("Polynomial Regression Model Performance Metrics")
print('RMSE: ', rmse)
print('R2 : ', poly_r2)
# Fit a logistic and a linear regression on the same training split.
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
# Predictions of both models on the validation split.
y_pred_log = log_reg.predict(x_valid)
y_pred_lin = lin_reg.predict(x_valid)
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Classification report for the logistic model; RMSE and R2 for the linear one.
report = classification_report(y_valid, y_pred_log)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred_lin))
r2 = r2_score(y_valid, y_pred_lin)
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# Further classifiers created with default settings (not fitted here).
naive_bayes = GaussianNB()
random_forest = RandomForestClassifier()
decision_tree = DecisionTreeClassifier()
svm = SVC()
# NOTE(review): no `grid` object is defined before this line in the visible
# code (a GridSearchCV named `grid` only appears further down) — confirm the
# intended cell order.
grid.best_params_
# Soft-voting ensemble over four classifiers.
from sklearn.ensemble import VotingClassifier
estimator = [('Logistic Regression', LogisticRegression(random_state=99)),
             ('Decision Tree', DecisionTreeClassifier(random_state=99)),
             ('Random Forest', RandomForestClassifier(random_state=99)),
             ('SVM', SVC(C=1000, gamma=0.0001, kernel='rbf', probability=True, random_state=99))]
voting_model = VotingClassifier(estimators=estimator, voting='soft')
voting_model.fit(x_train, y_train)
pred = voting_model.predict(x_valid)
report = classification_report(y_valid, pred)
print(report)
# Bagging with logistic regression as the base estimator.
from sklearn.ensemble import BaggingClassifier
bagging_model = BaggingClassifier(n_estimators=100, estimator=LogisticRegression())
bagging_model.fit(x_train, y_train)
pred = bagging_model.predict(x_valid)
report = classification_report(y_valid, pred)
# AdaBoost with a random forest as the base estimator.
from sklearn.ensemble import AdaBoostClassifier
adaboost_model = AdaBoostClassifier(n_estimators=100, estimator=RandomForestClassifier())
adaboost_model.fit(x_train, y_train)
# The method is predict(); the original `.pred` raises AttributeError.
pred = adaboost_model.predict(x_valid)
report = classification_report(y_valid, pred)
# Clustering: KMeans elbow chart and silhouette scores.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
k_range = range(1, 10)
inertias = []
k_model = []
# Fit one model per k and keep both the inertia and the fitted model.
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=10)
    kmeans.fit(df)
    inertias.append(kmeans.inertia_)
    k_model.append(kmeans)
plt.plot(list(k_range), inertias, "s-", linewidth=1)
plt.title('Elbow Chart')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.xticks(list(k_range))
plt.xlim(0, 9)
plt.ylim(0, 25000)
plt.grid(True)
plt.show()
from sklearn.metrics import silhouette_score
# Score against `df`, the data the models were fitted on (the original used
# an undefined `df1`). The k=1 model is skipped via k_model[1:], so the
# plotted range is k=2..9 and the title is corrected to match.
silhoutte_scores = [silhouette_score(df, model.labels_) for model in k_model[1:]]
plt.plot(range(2, 10), silhoutte_scores, "o-")
plt.title("Silhouette Scores for k=2 to k=9")
plt.axis([1.5, 8.5, 0.4, 0.8])
plt.grid(True)
plt.xlim(0, 12)
plt.ylim(0, 1)
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
kmeans = KMeans(n_clusters=4
<details>
<summary>英文:</summary>
# Quick look at the raw data: column info and summary statistics.
df_raw.info()
df_raw.describe()
# Correlation of each column with 'Age', sorted ascending.
df_corr = df_raw.corr()[['Age']].sort_values(by='Age')
sns.heatmap(df_corr, annot=True)
plt.title('Histogram for xx')
plt.hist(x=df_raw['Age'])
df_raw['Age'].hist()
plt.show()
sns.boxplot(x=df_raw['SibSp'], y=df_raw['Pclass'])
# Count of rows whose 'SibSp' value is 0.
counts = df_raw['SibSp'].value_counts()[0]
counts
# Median of the non-zero 'SibSp' values, used as the replacement value.
df_clean = df_raw[df_raw['SibSp'] != 0]
df_clean
df_clean.reset_index(drop=True)
avg = df_clean['SibSp'].median()
# Assign the result back instead of calling replace/fillna with inplace=True
# on the df_raw['SibSp'] selection: that pattern is deprecated in pandas and
# no longer updates df_raw once copy-on-write is active.
df_raw['SibSp'] = df_raw['SibSp'].replace(0, avg)
df_raw['SibSp'].value_counts()
df_raw['SibSp'] = df_raw['SibSp'].fillna(avg)
df_raw['SibSp'].dropna()
# Linear regression: impute missing 'age' from height and weight.
# (The page repeated the fit section twice; it is kept once.)
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
# Fit only on rows where all three columns are present.
df_clean = df_raw[['age', 'height_cm', 'weight_kg']].dropna()
df_predictor = df_clean[['height_cm', 'weight_kg']].copy()
df_target = df_clean['age'].copy()
lm.fit(df_predictor, df_target)
df_cln1 = df_raw.copy()
# Keep df_raw's index on the predictions: DataFrame.fillna aligns on index
# and column labels, so a fresh 0..n-1 index would fill the wrong rows (or
# none) whenever df_raw is not indexed 0..n-1.
# NOTE(review): predict() needs height_cm/weight_kg without NaN — confirm.
df_impute = pd.DataFrame({'age': lm.predict(df_raw[['height_cm', 'weight_kg']])},
                         index=df_raw.index)
df_cln1.fillna(df_impute, inplace=True)
print(df_cln1.isna().sum())
# Feature engineering: one-hot encode the categorical columns.
from sklearn.preprocessing import OneHotEncoder
categorical_variables = ['sex', 'fracture', 'medication']
data_cat = df_raw[categorical_variables]
cat_encoder = OneHotEncoder()
data_cat_onehot = cat_encoder.fit_transform(data_cat)
# Name the encoded columns and reuse df_raw's index so the axis=1 concat
# lines up row by row and does not add anonymous integer column labels.
data_cat_onehot_df = pd.DataFrame(
    data_cat_onehot.toarray(),
    columns=cat_encoder.get_feature_names_out(categorical_variables),
    index=df_raw.index,
)
df_temp = pd.concat([df_raw, data_cat_onehot_df], axis=1)
df_temp = df_temp.drop(categorical_variables, axis=1)
df_eng = df_temp.copy()
df_eng
# Drop rows with missing values before modelling.
df_raw = df_raw.dropna()
# Split data: 30 % of the rows go to the validation set, fixed seed.
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(
    df_raw.drop('Target', axis=1),
    df_raw['Target'],
    test_size=0.3,
    random_state=99,
)
# Model: degree-2 polynomial regression.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
poly_feat = PolynomialFeatures(degree=2)
poly_mod_x = poly_feat.fit_transform(np.array(x_train))
poly_model = LinearRegression()
poly_model.fit(poly_mod_x, y_train)
# The transformer is fitted on the training data only; validation data is
# just transformed.
poly_mod_x_val = poly_feat.transform(np.array(x_valid))
y_pred = poly_model.predict(poly_mod_x_val)
rmse = np.sqrt(mse(y_valid, y_pred))
# r2_score takes (y_true, y_pred); the result gets its own name so the
# imported function is not overwritten by a float.
poly_r2 = r2_score(y_valid, y_pred)
print("Polynomial Regression Model Performance Metrics")
print('RMSE: ', rmse)
print('R2 : ', poly_r2)
# Fit a logistic and a linear regression on the same training split.
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
# Predictions of both models on the validation split.
y_pred_log = log_reg.predict(x_valid)
y_pred_lin = lin_reg.predict(x_valid)
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Classification report for the logistic model; RMSE and R2 for the linear one.
report = classification_report(y_valid, y_pred_log)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred_lin))
r2 = r2_score(y_valid, y_pred_lin)
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# Further classifiers created with default settings (not fitted here).
naive_bayes = GaussianNB()
random_forest = RandomForestClassifier()
decision_tree = DecisionTreeClassifier()
svm = SVC()
# NOTE(review): no `grid` object is defined before this line in the visible
# code (a GridSearchCV named `grid` only appears further down) — confirm the
# intended cell order.
grid.best_params_
# Soft-voting ensemble over four classifiers.
from sklearn.ensemble import VotingClassifier
estimator = [('Logistic Regression', LogisticRegression(random_state=99)),
             ('Decision Tree', DecisionTreeClassifier(random_state=99)),
             ('Random Forest', RandomForestClassifier(random_state=99)),
             ('SVM', SVC(C=1000, gamma=0.0001, kernel='rbf', probability=True, random_state=99))]
voting_model = VotingClassifier(estimators=estimator, voting='soft')
voting_model.fit(x_train, y_train)
pred = voting_model.predict(x_valid)
report = classification_report(y_valid, pred)
print(report)
# Bagging with logistic regression as the base estimator.
from sklearn.ensemble import BaggingClassifier
bagging_model = BaggingClassifier(n_estimators=100, estimator=LogisticRegression())
bagging_model.fit(x_train, y_train)
pred = bagging_model.predict(x_valid)
report = classification_report(y_valid, pred)
# AdaBoost with a random forest as the base estimator.
from sklearn.ensemble import AdaBoostClassifier
adaboost_model = AdaBoostClassifier(n_estimators=100, estimator=RandomForestClassifier())
adaboost_model.fit(x_train, y_train)
# The method is predict(); the original `.pred` raises AttributeError.
pred = adaboost_model.predict(x_valid)
report = classification_report(y_valid, pred)
# Clustering: KMeans elbow chart and silhouette scores.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
k_range = range(1, 10)
inertias = []
k_model = []
# Fit one model per k and keep both the inertia and the fitted model.
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=10)
    kmeans.fit(df)
    inertias.append(kmeans.inertia_)
    k_model.append(kmeans)
plt.plot(list(k_range), inertias, "s-", linewidth=1)
plt.title('Elbow Chart')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.xticks(list(k_range))
plt.xlim(0, 9)
plt.ylim(0, 25000)
plt.grid(True)
plt.show()
from sklearn.metrics import silhouette_score
# Score against `df`, the data the models were fitted on (the original used
# an undefined `df1`). The k=1 model is skipped via k_model[1:], so the
# plotted range is k=2..9 and the title is corrected to match.
silhoutte_scores = [silhouette_score(df, model.labels_) for model in k_model[1:]]
plt.plot(range(2, 10), silhoutte_scores, "o-")
plt.title("Silhoutte Scores for k=2 to k=9")
plt.axis([1.5, 8.5, 0.4, 0.8])
plt.grid(True)
plt.xlim(0, 12)
plt.ylim(0,1)
# Three clustering models fitted on the same data.
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_raw)
y_kmeans = kmeans.predict(df_raw)
gaussian = GaussianMixture(n_components=4)
gaussian.fit(df_raw)
y_gaussian = gaussian.predict(df_raw)
dbscan = DBSCAN(eps=1.3, min_samples=4)
dbscan.fit(df_raw)
# DBSCAN has no predict(); the cluster labels of the fitted samples are
# exposed as labels_ on the fitted instance.
y_dbscan = dbscan.labels_
# Standardize the features (everything except 'Outcome'), then run PCA.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_standardize = scaler.fit_transform(df_raw.drop('Outcome', axis=1))
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_standardize)
x_pca = pca.transform(x_standardize)
# Fitted attributes end with an underscore: explained_variance_ratio_.
variance = pca.explained_variance_ratio_
sns.barplot(x=list(range(1, len(variance) + 1)), y=variance)
variance
# PCA on a feature matrix `x`.
# NOTE(review): `x` is not defined in the visible code — confirm its origin.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_standardize = scaler.fit_transform(x)
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_standardize)
x_pca = pca.transform(x_standardize)
# Impute missing 'healthy' values with a linear model on sex, age and BMI.
cat = ['sex', 'age', 'BMI']
# Rows without missing values are the training data for the imputation model
# (the original fitted on df_raw, which may still contain NaN).
df = df_raw.dropna().copy()
df_pred = df[cat]
df_resp = df['healthy']
lin_reg = LinearRegression()
lin_reg.fit(df_pred, df_resp)
# fillna needs a scalar/dict/Series, not a bare array: wrap the predictions
# in a Series on df_raw's index so they align row by row, and fill the raw
# column (the dropna'd copy has nothing left to fill).
# NOTE(review): predict() needs the predictor columns without NaN — confirm.
healthy_pred = pd.Series(lin_reg.predict(df_raw[cat]), index=df_raw.index)
df_new = df_raw['healthy'].fillna(healthy_pred)
# Clustering with five clusters/components, plus a naive Bayes classifier.
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
from sklearn.naive_bayes import GaussianNB
kmeans = KMeans(n_clusters=5)
kmeans.fit(df_raw)
y_kmeans = kmeans.predict(df_raw)
gaus_mix = GaussianMixture(n_components=5)
gaus_mix.fit(df_raw)
y_gaus = gaus_mix.predict(df_raw)
dbscan = DBSCAN(eps=1.2)
dbscan.fit(df_raw)
# DBSCAN has no predict(); use the labels_ of the fitted samples instead.
y_dbscan = dbscan.labels_
naive_bayes = GaussianNB()
naive_bayes.fit(x_train, y_train)
y_pred = naive_bayes.predict(x_valid)
report = classification_report(y_valid, y_pred)
# Correlation of each column with 'Outcome', sorted ascending.
df_corr = df_raw.corr()[['Outcome']].sort_values(by='Outcome')
sns.heatmap(df_corr, annot=True)
# TODO: scatter-plot template — the original line had empty arguments and did
# not parse; fill in the columns before enabling it:
# sns.scatterplot(x=<column>, y=<column>, hue=<column>)
# Elbow curve: inertia of KMeans for k = 1..9.
inertia = []
K = range(1, 10)
for i in K:
    # The keyword is n_clusters (the original misspelt it as n_clsuters).
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(df_raw)
    inertia.append(kmeans.inertia_)
plt.plot(K, inertia)
plt.xlabel('K')
plt.ylabel('Inertia')
plt.show()
# One box plot per numeric column, grouped by 'Target'.
numeric_columns = list(df_raw.select_dtypes(include=[np.number]).columns)
for col in numeric_columns:
    plt.figure()
    sns.boxplot(data=df_raw, x='Target', y=col)
# Dummy-encode 'Sex' and 'Embarked' and swap them in for the raw columns.
from sklearn.preprocessing import OneHotEncoder
cat = ['Sex', 'Embarked']
df_cat = df_raw[cat]
df_encoded = pd.get_dummies(df_cat)
# axis=1 places the dummy columns next to the original ones; the default
# axis=0 would append them as extra rows.
df_new = pd.concat([df_raw, df_encoded], axis=1)
df_new = df_new.drop(cat, axis=1)
# Grid search over C and gamma for an RBF-kernel SVC.
from sklearn.model_selection import GridSearchCV
params_grid = {'C': [1, 10, 100, 1000],
               'gamma': [0.00001, 0.001, 0.01, 0.1],
               'kernel': ['rbf']}
grid = GridSearchCV(SVC(), params_grid, refit=True, verbose=1)
# best_params_ only exists after the search has been fitted.
grid.fit(x_train, y_train)
grid.best_params_
# Polynomial (degree 2) feature expansion followed by a linear fit.
# NOTE(review): `lm`, `x`, `y` and `x_val` come from elsewhere; `x` is assumed
# to be the training feature matrix that matches `y` — confirm.
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=2)
# Fit the model on the expanded features, not on the transformer object.
x_poly = poly_reg.fit_transform(x)
lm.fit(x_poly, y)
# The transformer is not callable; validation data goes through transform().
x_val_poly = poly_reg.transform(x_val)
</details>
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论