KNN algorithm in regression

Question

I have the following data set, and I am trying to predict the "Total kg" based on the "Number of plants" using KNN:

    Number of plants    Total kg
    900                  7.565
    1440                17.808
    2340                25.373
    900                 10.340
    900                 10.610

    import pandas as pd
    from sklearn.neighbors import KNeighborsClassifier

    df = pd.read_csv(r'final2.csv')

    X = df.drop('Total kg', axis=1)
    Y = df[["Total kg"]]

    X = X.values
    Y = Y.values

    knn = KNeighborsClassifier(n_neighbors=1)

    prediction = knn.fit(X, Y)

I get the following error: ValueError: Unknown label type: 'continuous'

Is there a way to use KNN for continuous variables?

Answer 1

Score: 1

You should not use a classifier for a regression task. Try:

    from sklearn.neighbors import KNeighborsRegressor

    knn = KNeighborsRegressor(n_neighbors=1)
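
For completeness, a minimal runnable sketch of that fix applied to the sample data from the question (the DataFrame is built inline instead of being read from final2.csv, purely for illustration):

    import pandas as pd
    from sklearn.neighbors import KNeighborsRegressor

    # Sample data from the question
    df = pd.DataFrame({
        "Number of plants": [900, 1440, 2340, 900, 900],
        "Total kg": [7.565, 17.808, 25.373, 10.340, 10.610],
    })

    X = df[["Number of plants"]].values   # 2-D feature matrix, shape (5, 1)
    y = df["Total kg"].values             # 1-D continuous target

    knn = KNeighborsRegressor(n_neighbors=1)
    knn.fit(X, y)                         # no "Unknown label type" error

    # Predicts the "Total kg" of the closest training row
    print(knn.predict([[1000]]))

With n_neighbors=1 the model simply returns the "Total kg" of the single nearest row (here one of the 900-plant rows; ties between equidistant points are broken by training order). That is unstable on this data, where three rows share "Number of plants" = 900 but have different yields, so a larger n_neighbors, which averages the targets of the k nearest rows, is usually a better choice.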

Answer 2

Score: 1

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import plotly.express as px

    # Scaling
    from sklearn.preprocessing import RobustScaler
    from sklearn.preprocessing import LabelEncoder

    # Train test split
    from sklearn.model_selection import train_test_split

    # Models
    import torch
    import torch.nn as nn
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import GradientBoostingClassifier

    # Metrics
    from sklearn.metrics import accuracy_score, classification_report, roc_curve

    # Cross validation
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import GridSearchCV

    import warnings
    warnings.filterwarnings("ignore")

    df = pd.read_csv("heart.csv")

    # Preview the first 10 rows of the data
    df.head(10)
    df.dtypes

    # Shape of the data
    print("The shape of the dataset is:", df.shape)

    # Get the categorical columns
    string_col = df.select_dtypes("string").columns.to_list()

    # Check the number of unique values in each column
    unique_counts = {}
    for i in list(df.columns):
        unique_counts[i] = df[i].value_counts().shape[0]
    pd.DataFrame(unique_counts, index=["unique count"]).transpose()

    # Check for duplicates
    df[df.duplicated()]

    # Remove duplicates
    df.drop_duplicates(keep='first', inplace=True)

    # Check the new shape
    print('Number of rows are', df.shape[0], 'and number of columns are', df.shape[1])

    num_col = df.columns.to_list()
    for col in string_col:
        num_col.remove(col)
    num_col.remove("output")

    df[string_col].head()
    for col in string_col:
        print(f"The distribution of categorical values in the {col} is:")
        print(df[col].value_counts())

    # Check statistical summaries
    df.describe()
    df.corr()

    # Separate the columns into categorical and continuous
    cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
    con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]
    target_col = ["output"]
    print("The categorical cols are:", cat_cols)
    print("The continuous cols are:", con_cols)
    print("The target variable is:", target_col)

    df[con_cols].describe().transpose()

    # EDA

    # Pair plot
    plt.figure(figsize=(20, 20))
    sns.pairplot(df, hue='output', palette=["#8000ff", "#da8829"])
    plt.show()

    # Violin plots
    plt.figure(figsize=(18, 10))
    plt.subplot(2, 3, 1)
    sns.violinplot(x='sex', y='output', data=df)
    plt.xticks(rotation=45)
    plt.subplot(2, 3, 2)
    sns.violinplot(x='thall', y='output', data=df)
    plt.xticks(rotation=45)
    plt.subplot(2, 3, 3)
    sns.violinplot(x='exng', y='output', data=df)
    plt.xticks(rotation=45)
    plt.subplot(2, 3, 4)
    sns.violinplot(x='restecg', y='output', data=df)
    plt.xticks(rotation=45)
    plt.subplot(2, 3, 5)
    sns.violinplot(x='cp', y='output', data=df)
    plt.xticks(rotation=45)
    plt.subplot(2, 3, 6)
    sns.violinplot(x='fbs', y='output', data=df)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Heatmap
    px.imshow(df.corr(), title="Correlation Plot of the Heart Failure Prediction")

    plt.figure(figsize=(16, 8))
    sns.heatmap(df.corr(), annot=True, cmap='gnuplot2_r', fmt='.1f')

    # Count plot of categorical features
    fig = plt.figure(figsize=(18, 15))
    gs = fig.add_gridspec(3, 3)
    gs.update(wspace=0.5, hspace=0.25)
    ax0 = fig.add_subplot(gs[0, 0])
    ax1 = fig.add_subplot(gs[0, 1])
    ax2 = fig.add_subplot(gs[0, 2])
    ax3 = fig.add_subplot(gs[1, 0])
    ax4 = fig.add_subplot(gs[1, 1])
    ax5 = fig.add_subplot(gs[1, 2])
    ax6 = fig.add_subplot(gs[2, 0])
    ax7 = fig.add_subplot(gs[2, 1])
    ax8 = fig.add_subplot(gs[2, 2])

    background_color = "#ffe6e6"
    color_palette = ["#800000", "#8000ff", "#6aac90", "#5833ff", "#da8829"]
    fig.patch.set_facecolor(background_color)
    ax0.set_facecolor(background_color)
    ax1.set_facecolor(background_color)
    ax2.set_facecolor(background_color)
    ax3.set_facecolor(background_color)
    ax4.set_facecolor(background_color)
    ax5.set_facecolor(background_color)
    ax6.set_facecolor(background_color)
    ax7.set_facecolor(background_color)
    ax8.set_facecolor(background_color)

    # Title of the plot
    ax0.spines["bottom"].set_visible(False)
    ax0.spines["left"].set_visible(False)
    ax0.spines["top"].set_visible(False)
    ax0.spines["right"].set_visible(False)
    ax0.tick_params(left=False, bottom=False)
    ax0.set_xticklabels([])
    ax0.set_yticklabels([])
    ax0.text(0.5, 0.5,
             'Count plot for various\n categorical features\n_________________',
             horizontalalignment='center',
             verticalalignment='center',
             fontsize=18, fontweight='bold',
             fontfamily='serif',
             color="#000000")

    # Sex count
    ax1.text(0.3, 220, 'Sex', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax1.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.countplot(ax=ax1, data=df, x='sex', palette=color_palette)
    ax1.set_xlabel("")
    ax1.set_ylabel("")

    # Exng count
    ax2.text(0.3, 220, 'Exng', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax2.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.countplot(ax=ax2, data=df, x='exng', palette=color_palette)
    ax2.set_xlabel("")
    ax2.set_ylabel("")

    # Caa count
    ax3.text(1.5, 200, 'Caa', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax3.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.countplot(ax=ax3, data=df, x='caa', palette=color_palette)
    ax3.set_xlabel("")
    ax3.set_ylabel("")

    # Cp count
    ax4.text(1.5, 162, 'Cp', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax4.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.countplot(ax=ax4, data=df, x='cp', palette=color_palette)
    ax4.set_xlabel("")
    ax4.set_ylabel("")

    # Fbs count
    ax5.text(0.5, 290, 'Fbs', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax5.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.countplot(ax=ax5, data=df, x='fbs', palette=color_palette)
    ax5.set_xlabel("")
    ax5.set_ylabel("")

    # Restecg count
    ax6.text(0.75, 165, 'Restecg', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax6.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.countplot(ax=ax6, data=df, x='restecg', palette=color_palette)
    ax6.set_xlabel("")
    ax6.set_ylabel("")

    # Slp count
    ax7.text(0.85, 155, 'Slp', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax7.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.countplot(ax=ax7, data=df, x='slp', palette=color_palette)
    ax7.set_xlabel("")
    ax7.set_ylabel("")

    # Thall count
    ax8.text(1.2, 180, 'Thall', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax8.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.countplot(ax=ax8, data=df, x='thall', palette=color_palette)
    ax8.set_xlabel("")
    ax8.set_ylabel("")

    # Hide the top, right and left spines on every count plot
    for s in ["top", "right", "left"]:
        ax1.spines[s].set_visible(False)
        ax2.spines[s].set_visible(False)
        ax3.spines[s].set_visible(False)
        ax4.spines[s].set_visible(False)
        ax5.spines[s].set_visible(False)
        ax6.spines[s].set_visible(False)
        ax7.spines[s].set_visible(False)
        ax8.spines[s].set_visible(False)

    # Boxen plot of continuous features
    fig = plt.figure(figsize=(18, 16))
    gs = fig.add_gridspec(2, 3)
    gs.update(wspace=0.3, hspace=0.15)
    ax0 = fig.add_subplot(gs[0, 0])
    ax1 = fig.add_subplot(gs[0, 1])
    ax2 = fig.add_subplot(gs[0, 2])
    ax3 = fig.add_subplot(gs[1, 0])
    ax4 = fig.add_subplot(gs[1, 1])
    ax5 = fig.add_subplot(gs[1, 2])

    background_color = "#ffe6e6"
    color_palette = ["#800000", "#8000ff", "#6aac90", "#5833ff", "#da8829"]
    fig.patch.set_facecolor(background_color)
    ax0.set_facecolor(background_color)
    ax1.set_facecolor(background_color)
    ax2.set_facecolor(background_color)
    ax3.set_facecolor(background_color)
    ax4.set_facecolor(background_color)
    ax5.set_facecolor(background_color)

    # Title of the plot
    ax0.spines["bottom"].set_visible(False)
    ax0.spines["left"].set_visible(False)
    ax0.spines["top"].set_visible(False)
    ax0.spines["right"].set_visible(False)
    ax0.tick_params(left=False, bottom=False)
    ax0.set_xticklabels([])
    ax0.set_yticklabels([])
    ax0.text(0.5, 0.5,
             'Boxen plot for various\n continuous features\n_________________',
             horizontalalignment='center',
             verticalalignment='center',
             fontsize=18, fontweight='bold',
             fontfamily='serif',
             color="#000000")

    # Age
    ax1.text(-0.05, 81, 'Age', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax1.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.boxenplot(ax=ax1, y=df['age'], palette=["#800000"], width=0.6)
    ax1.set_xlabel("")
    ax1.set_ylabel("")

    # Trtbps
    ax2.text(-0.05, 208, 'Trtbps', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax2.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.boxenplot(ax=ax2, y=df['trtbps'], palette=["#8000ff"], width=0.6)
    ax2.set_xlabel("")
    ax2.set_ylabel("")

    # Chol
    ax3.text(-0.05, 600, 'Chol', fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    ax3.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    sns.boxenplot(ax=ax3, y=df['chol'], palette=["#6aac90"], width=0.6)
    ax3.set_xlabel("")
    ax3.set_ylabel("")

    # Thalachh
    ax4.text(-0.0

Answer 3

Score: 1


    df_raw.info()
    df_raw.describe()

    df_corr = df_raw.corr()[['Age']].sort_values(by='Age')
    sns.heatmap(df_corr, annot=True)

    plt.title('Histogram for xx')
    plt.hist(x=df_raw['Age'])
    df_raw['Age'].hist()
    plt.show()

    sns.boxplot(x=df_raw['SibSp'], y=df_raw['Pclass'])

    counts = df_raw['SibSp'].value_counts()[0]
    counts

    df_clean = df_raw[df_raw['SibSp'] != 0]
    df_clean

    df_clean.reset_index(drop=True)

    avg = df_clean['SibSp'].median()
    df_raw['SibSp'].replace(0, avg, inplace=True)
    df_raw['SibSp'].value_counts()

    df_raw['SibSp'].fillna(avg, inplace=True)
    df_raw['SibSp'].dropna()

Linear regression

    from sklearn.linear_model import LinearRegression

    lm = LinearRegression()

    df_clean = df_raw[['age', 'height_cm', 'weight_kg']].dropna()
    df_predictor = df_clean[['height_cm', 'weight_kg']].copy()
    df_target = df_clean['age'].copy()

    lm.fit(df_predictor, df_target)

    df_cln1 = df_raw.copy()
    df_impute = pd.DataFrame(lm.predict(df_raw[['height_cm', 'weight_kg']]))
    df_impute.rename({0: 'age'}, axis=1, inplace=True)
    df_cln1.fillna(df_impute, inplace=True)
    print(df_cln1.isna().sum())

Feature engineering

    from sklearn.preprocessing import OneHotEncoder

    categorical_variables = ['sex', 'fracture', 'medication']
    data_cat = df_raw[categorical_variables]

    cat_encoder = OneHotEncoder()
    data_cat_onehot = cat_encoder.fit_transform(data_cat)
    data_cat_onehot_df = pd.DataFrame(data_cat_onehot.toarray())

    df_temp = pd.concat([df_raw, data_cat_onehot_df], axis=1)
    df_temp = df_temp.drop(categorical_variables, axis=1)
    df_eng = df_temp.copy()
    df_eng

    df_raw = df_raw.dropna()

Split data

    from sklearn.model_selection import train_test_split

    x_train, x_valid, y_train, y_valid = train_test_split(df_raw.drop('Target', axis=1),
                                                          df_raw['Target'],
                                                          test_size=0.3,
                                                          random_state=99)

Model

    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import mean_squared_error as mse
    from sklearn.metrics import r2_score

    poly_feat = PolynomialFeatures(degree=2)

    poly_mod_x = poly_feat.fit_transform(np.array(x_train))
    poly_model = LinearRegression()
    poly_model.fit(poly_mod_x, y_train)

    # Use transform (not fit_transform) on the validation set
    poly_mod_x_val = poly_feat.transform(np.array(x_valid))
    y_pred = poly_model.predict(poly_mod_x_val)
    rmse = np.sqrt(mse(y_valid, y_pred))
    r2 = r2_score(y_valid, y_pred)

    print("Polynomial Regression Model Performance Metrics")
    print('RMSE: ', rmse)
    print('R2  : ', r2)

    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LinearRegression

    log_reg = LogisticRegression()
    log_reg.fit(x_train, y_train)
    lin_reg = LinearRegression()
    lin_reg.fit(x_train, y_train)
    y_pred_log = log_reg.predict(x_valid)
    y_pred_lin = lin_reg.predict(x_valid)

    from sklearn.metrics import classification_report
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import r2_score

    report = classification_report(y_valid, y_pred_log)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred_lin))
    r2 = r2_score(y_valid, y_pred_lin)

    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC

    naive_bayes = GaussianNB()
    random_forest = RandomForestClassifier()
    decision_tree = DecisionTreeClassifier()
    svm = SVC()

    grid.best_params_  # requires the fitted GridSearchCV defined further down

    from sklearn.ensemble import VotingClassifier

    estimator = [('Logistic Regression', LogisticRegression(random_state=99)),
                 ('Decision Tree', DecisionTreeClassifier(random_state=99)),
                 ('Random Forest', RandomForestClassifier(random_state=99)),
                 ('SVM', SVC(C=1000, gamma=0.0001, kernel='rbf', probability=True, random_state=99))]

    voting_model = VotingClassifier(estimators=estimator, voting='soft')
    voting_model.fit(x_train, y_train)
    pred = voting_model.predict(x_valid)
    report = classification_report(y_valid, pred)
    print(report)

    from sklearn.ensemble import BaggingClassifier

    bagging_model = BaggingClassifier(n_estimators=100, estimator=LogisticRegression())
    bagging_model.fit(x_train, y_train)
    pred = bagging_model.predict(x_valid)
    report = classification_report(y_valid, pred)

    from sklearn.ensemble import AdaBoostClassifier

    adaboost_model = AdaBoostClassifier(n_estimators=100, estimator=RandomForestClassifier())
    adaboost_model.fit(x_train, y_train)
    pred = adaboost_model.predict(x_valid)
    report = classification_report(y_valid, pred)

Clustering KMeans

    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt

    k_range = range(1, 10)
    inertias = []
    k_model = []

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=10)
        kmeans.fit(df)
        inertias.append(kmeans.inertia_)
        k_model.append(kmeans)

    plt.plot(list(k_range), inertias, "s-", linewidth=1)
    plt.title('Elbow Chart')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia')
    plt.xticks(list(k_range))
    plt.xlim(0, 9)
    plt.ylim(0, 25000)
    plt.grid(True)
    plt.show()

    from sklearn.metrics import silhouette_score
    silhouette_scores = [silhouette_score(df1, model.labels_) for model in k_model[1:]]

    plt.plot(range(2, 10), silhouette_scores, "o-")
    plt.title("Silhouette Scores for k=2 to k=9")
    plt.axis([1.5, 8.5, 0.4, 0.8])
    plt.grid(True)
    plt.xlim(0, 12)
    plt.ylim(0, 1)

    from sklearn.mixture import GaussianMixture
    from sklearn.cluster import DBSCAN

    kmeans = KMeans(n_clusters=4)
    kmeans.fit(df_raw)
    y_kmeans = kmeans.predict(df_raw)

    gaussian = GaussianMixture(n_components=4)
    gaussian.fit(df_raw)
    y_gaussian = gaussian.predict(df_raw)

    # DBSCAN has no predict method; fit_predict returns the cluster labels
    dbscan = DBSCAN(eps=1.3, min_samples=4)
    y_dbscan = dbscan.fit_predict(df_raw)

    # Standardize
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    x_standardize = scaler.fit_transform(df_raw.drop('Outcome', axis=1))

    from sklearn.decomposition import PCA

    pca = PCA()
    pca.fit(x_standardize)
    x_pca = pca.transform(x_standardize)

    variance = pca.explained_variance_ratio_
    sns.barplot(x=list(range(1, len(variance) + 1)), y=variance)
    variance

    # PCA
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    x_standardize = scaler.fit_transform(x)

    from sklearn.decomposition import PCA

    pca = PCA()
    pca.fit(x_standardize)
    x_pca = pca.transform(x_standardize)

    cat = ['sex', 'age', 'BMI']
    df = df_raw.dropna().copy()
    df_pred = df_raw[cat]
    df_resp = df_raw[['healthy']]

    lin_reg = LinearRegression()
    lin_reg.fit(df_pred, df_resp)
    # fillna needs a Series aligned on the index, not a bare array
    df_new = df['healthy'].fillna(
        pd.Series(lin_reg.predict(df_raw[cat]).ravel(), index=df_raw.index))

    from sklearn.cluster import KMeans
    from sklearn.mixture import GaussianMixture
    from sklearn.cluster import DBSCAN
    from sklearn.naive_bayes import GaussianNB

    kmeans = KMeans(n_clusters=5)
    kmeans.fit(df_raw)
    y_kmeans = kmeans.predict(df_raw)

    gaus_mix = GaussianMixture(n_components=5)
    gaus_mix.fit(df_raw)
    y_gaus = gaus_mix.predict(df_raw)

    # Again, use fit_predict for DBSCAN
    dbscan = DBSCAN(eps=1.2)
    y_dbscan = dbscan.fit_predict(df_raw)

    naive_bayes = GaussianNB()
    naive_bayes.fit(x_train, y_train)
    y_pred = naive_bayes.predict(x_valid)
    report = classification_report(y_valid, y_pred)

    df_corr = df_raw.corr()[['Outcome']].sort_values(by='Outcome')
    sns.heatmap(df_corr, annot=True)

    # sns.scatterplot(x= , y= , hue= )  # column arguments left blank in the original

    inertia = []
    K = range(1, 10)

    for i in K:
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(df_raw)
        inertia.append(kmeans.inertia_)

    plt.plot(K, inertia)
    plt.xlabel('K')
    plt.ylabel('Inertia')
    plt.show()

    numeric_columns = list(df_raw.select_dtypes(include=[np.number]).columns)
    for i, col in enumerate(numeric_columns):
        plt.figure()
        sns.boxplot(data=df_raw, x='Target', y=col)

    from sklearn.preprocessing import OneHotEncoder
    cat = ['Sex', 'Embarked']
    df_cat = df_raw[cat]
    df_encoded = pd.get_dummies(df_cat)

    df_new = pd.concat([df_raw, df_encoded], axis=1)
    df_new = df_new.drop(cat, axis=1)

    from sklearn.model_selection import GridSearchCV

    params_grid = {'C': [1, 10, 100, 1000],
                   'gamma': [0.00001, 0.001, 0.01, 0.1],
                   'kernel': ['rbf']}

    grid = GridSearchCV(SVC(), params_grid, refit=True, verbose=1)
    grid.fit(x_train, y_train)  # the grid must be fitted before best_params_ is available

    grid.best_params_

    from sklearn.preprocessing import PolynomialFeatures

    # Expand the features to degree-2 polynomial terms, fit on the
    # transformed training features, then apply the same transform
    # to the validation set
    poly_reg = PolynomialFeatures(degree=2)
    x_poly = poly_reg.fit_transform(x)
    lm.fit(x_poly, y)
    x_val_poly = poly_reg.transform(x_val)



huangapple
  • Posted on 2023-02-14 21:30:28
  • Please keep this link when reposting: https://go.coder-hub.com/75448554.html