MNIST NN from scratch, output converges on one number
# Question
After watching Samson Zhang's video and 3blue1brown, I tried to write a neural network (NN) from scratch. (The code is heavily based on Samson's, but I did go through the whole derivation of how he arrived at the formulas.) I wrote down the equations and derivations to make sure I fully understood how it worked. But when I run the model, accuracy tops out at around 17% and then starts going down, the predictions all seem to converge on one number, and when they do I also get these error messages:
`RuntimeWarning: overflow encountered in exp a2=np.exp(z)/sum(np.exp(z))`
`RuntimeWarning: invalid value encountered in divide a2=np.exp(z)/sum(np.exp(z))`
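For context on the two warnings: in float64, `np.exp` overflows to `inf` once its argument exceeds roughly 709, and the subsequent `inf / inf` division produces `nan`. A minimal sketch with made-up logits (not taken from the training run) reproduces the same pair of warnings:

```python
import numpy as np

z = np.array([[10.0, 800.0, 5.0]])  # one row of artificially large logits

# exp(800) overflows float64 ("overflow encountered in exp"), and the
# resulting inf / inf division is nan ("invalid value encountered in divide").
naive = np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
print(naive)  # [[ 0. nan  0.]]
```

That would also match the training log further down, where the warnings only show up after a number of iterations, once the logits have grown large.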
The dimensions of the input, nodes, weights, and biases are as below; a quick shape check follows the list.
- a0 (input) = m x 784
- w1 = 784 x 20
- b1 = 1 x 20
- a1 = m x 20
- w2 = 20 x 10
- b2 = 1 x 10
- a2 (output) = m x 10
- y (one-hot encoded) = m x 10
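For reference, these shapes can be verified with random arrays before any training; the sketch below is not part of the training code, it just confirms that the listed dimensions propagate to an `m x 10` output (the batch size `m = 4` is arbitrary):

```python
import numpy as np

m = 4                                  # arbitrary batch size for the check
a0 = np.random.rand(m, 784)            # a0 (input): m x 784
w1, b1 = np.random.rand(784, 20) - 0.5, np.random.rand(1, 20) - 0.5
w2, b2 = np.random.rand(20, 10) - 0.5, np.random.rand(1, 10) - 0.5

a1 = np.maximum(a0 @ w1 + b1, 0)       # a1: m x 20 (b1 broadcasts over rows)
a2 = a1 @ w2 + b2                      # a2: m x 10 (pre-softmax logits)
print(a1.shape, a2.shape)              # (4, 20) (4, 10)
```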
I have tried changing the learning rate (alpha) to a smaller number, but every value leads to the same error in the end.
```python
import numpy as np
import pandas as pd

# 'data' is the MNIST training set loaded elsewhere as a NumPy array
# (column 0 = label, columns 1-784 = pixel values)
m, n = data.shape
x_train = data[0:m, 1:785]
y_train = data[0:m, 0]
x_train = x_train / 255

def init_param():
    w1 = np.random.rand(784, 20) - 0.5
    b1 = np.random.rand(1, 20) - 0.5
    w2 = np.random.rand(20, 10) - 0.5
    b2 = np.random.rand(1, 10) - 0.5
    return w1, b1, w2, b2

def ReLU(z):
    return np.maximum(z, 0)

def Softmax(z):
    a2 = np.exp(z) / sum(np.exp(z))
    return a2

def f_propagation(a0, w1, b1, w2, b2):
    z1 = a0.dot(w1) + b1  # m x 20
    a1 = ReLU(z1)         # m x 20
    z2 = a1.dot(w2) + b2  # m x 10
    a2 = Softmax(z2)      # m x 10
    return z1, a1, z2, a2

def dev_ReLU(z):
    return z > 0

def one_hotencode(y):
    y_hat = np.zeros((np.size(y), 10))
    y_hat[np.arange(y.size), y] = 1
    return y_hat

def b_propagation(x, y, z1, w1, a1, z2, w2, a2):
    y_hat = one_hotencode(y)
    dadc = a2 - y_hat  # m x 10  Start from here
    dw2 = 1/m * (a1.T.dot(dadc))      # 20 x 10
    db2 = 1/m * np.sum(dadc, axis=0)  # 1 x 10
    dw1 = 1/m * x.T.dot((w2.dot(dadc.T).T * dev_ReLU(z1)))           # 784 x 20
    db1 = 1/m * np.sum((w2.dot(dadc.T).T * dev_ReLU(z1)), axis=0)    # 1 x 20
    return dw2, db2, dw1, db1

def update_param(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
    w1 = w1 - alpha * dw1
    b1 = b1 - alpha * db1
    w2 = w2 - alpha * dw2
    b2 = b2 - alpha * db2
    return w1, b1, w2, b2

def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(x, y, alpha=0.01, iterations=500):
    w1, b1, w2, b2 = init_param()
    for i in range(iterations):
        z1, a1, z2, a2 = f_propagation(x, w1, b1, w2, b2)
        dw2, db2, dw1, db1 = b_propagation(x, y, z1, w1, a1, z2, w2, a2)
        w1, b1, w2, b2 = update_param(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(a2.T)
            print(get_accuracy(predictions, y))
    return w1, b1, w2, b2

w1, b1, w2, b2 = gradient_descent(x_train, y_train, 0.01, 500)
```
Results:

```
Iteration: 0
[5 0 2 ... 1 7 3] [4 3 7 ... 8 1 1]
0.08547619047619047
Iteration: 10
[5 7 2 ... 1 3 3] [4 3 7 ... 8 1 1]
0.09669047619047619
Iteration: 20
[5 7 7 ... 1 3 3] [4 3 7 ... 8 1 1]
0.10857142857142857
Iteration: 30
[5 7 7 ... 1 3 3] [4 3 7 ... 8 1 1]
0.11978571428571429
Iteration: 40
[5 7 7 ... 1 3 3] [4 3 7 ... 8 1 1]
0.12797619047619047
Iteration: 50
[5 7 7 ... 1 3 3] [4 3 7 ... 8 1 1]
0.13035714285714287
Iteration: 60
[5 7 7 ... 1 3 3] [4 3 7 ... 8 1 1]
0.12047619047619047
Iteration: 70
[5 1 1 ... 1 3 3] [4 3 7 ... 8 1 1]
0.09740476190476191
Iteration: 80
[3 1 1 ... 1 3 3] [4 3 7 ... 8 1 1]
0.0975952380952381
Iteration: 90
[3 1 1 ... 1 3 3] [4 3 7 ... 8 1 1]
0.11626190476190476
Iteration: 100
[1 1 1 ... 1 1 1] [4 3 7 ... 8 1 1]
0.12088095238095238
Iteration: 110
[1 1 1 ... 1 1 1] [4 3 7 ... 8 1 1]
0.112
Iteration: 120
[1 1 1 ... 1 1 1] [4 3 7 ... 8 1 1]
0.11152380952380953
Iteration: 130
[1 1 1 ... 1 1 1] [4 3 7 ... 8 1 1]
0.11152380952380953
Iteration: 140
[1 1 1 ... 1 1 1] [4 3 7 ... 8 1 1]
0.11152380952380953
/tmp/ipykernel_29/2242731165.py:13: RuntimeWarning: overflow encountered in exp
a2=np.exp(z)/sum(np.exp(z))
/tmp/ipykernel_29/2242731165.py:13: RuntimeWarning: invalid value encountered in divide
a2=np.exp(z)/sum(np.exp(z))
```
# Answer 1
**Score**: 1
You were very close to a working solution. Here is a working version of your code in which I just paid closer attention to the computation of the softmax, to avoid overflows and underflows:
```python
from sklearn.datasets import load_digits
import numpy as np

# Load the scikit-learn digits dataset (a small 8x8 stand-in for MNIST)
mnist = load_digits()

# Extract the features (input) and labels (output)
x_train = mnist.data    # Input features
y_train = mnist.target  # Output labels

# Normalize the input features
x_train = x_train / 255.0

def init_param():
    w1 = np.random.rand(64, 20) - 0.5
    b1 = np.random.rand(1, 20) - 0.5
    w2 = np.random.rand(20, 10) - 0.5
    b2 = np.random.rand(1, 10) - 0.5
    return w1, b1, w2, b2

def ReLU(z):
    return np.maximum(z, 0)

def Softmax(z):
    # Shift by the row-wise max so np.exp never sees large positive arguments
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    softmax = exp_z / np.sum(exp_z, axis=1, keepdims=True)
    return softmax

def f_propagation(a0, w1, b1, w2, b2):
    z1 = np.dot(a0, w1) + b1  # m x 20
    a1 = ReLU(z1)             # m x 20
    z2 = np.dot(a1, w2) + b2  # m x 10
    a2 = Softmax(z2)          # m x 10
    return z1, a1, z2, a2

def dev_ReLU(z):
    return z > 0

def one_hotencode(y):
    y_hat = np.zeros((y.size, 10))
    y_hat[np.arange(y.size), y] = 1
    return y_hat

def b_propagation(x, y, z1, w1, a1, z2, w2, a2):
    y_hat = one_hotencode(y)
    dadz2 = a2 - y_hat                          # m x 10
    dw2 = np.dot(a1.T, dadz2)                   # 20 x 10
    db2 = np.sum(dadz2, axis=0)                 # 1 x 10
    dadz1 = np.dot(dadz2, w2.T) * dev_ReLU(z1)  # m x 20
    dw1 = np.dot(x.T, dadz1)                    # 64 x 20
    db1 = np.sum(dadz1, axis=0)                 # 1 x 20
    return dw2, db2, dw1, db1

def update_param(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
    w1 = w1 - alpha * dw1
    b1 = b1 - alpha * db1
    w2 = w2 - alpha * dw2
    b2 = b2 - alpha * db2
    return w1, b1, w2, b2

def get_predictions(A2):
    return np.argmax(A2, axis=1)

def get_accuracy(predictions, Y):
    return np.mean(predictions == Y)

def gradient_descent(x, y, alpha=0.01, iterations=500):
    m, n = x.shape
    w1, b1, w2, b2 = init_param()
    for i in range(iterations):
        z1, a1, z2, a2 = f_propagation(x, w1, b1, w2, b2)
        dw2, db2, dw1, db1 = b_propagation(x, y, z1, w1, a1, z2, w2, a2)
        w1, b1, w2, b2 = update_param(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(a2)
            accuracy = get_accuracy(predictions, y)
            print("Accuracy:", accuracy)
    return w1, b1, w2, b2

w1, b1, w2, b2 = gradient_descent(x_train, y_train, 0.0001, 5000)
```

Hope this helps. Let me know if you have any other questions.
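The key numerical change is inside `Softmax`: subtracting the row-wise maximum before exponentiating leaves the result unchanged (softmax is invariant to adding a constant to every logit in a row) while keeping every argument of `np.exp` at or below zero, so nothing overflows. A small standalone check of that property, with made-up logit values:

```python
import numpy as np

def naive_softmax(z):
    # Direct formula: overflows as soon as any logit exceeds ~709
    return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)

def stable_softmax(z):
    # Same function, shifted by the row max so every exponent is <= 0
    shifted = z - np.max(z, axis=1, keepdims=True)
    return np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)

small = np.array([[1.0, 2.0, 3.0]])
large = np.array([[1000.0, 1001.0, 1002.0]])

print(np.allclose(naive_softmax(small), stable_softmax(small)))  # True
print(stable_softmax(large))  # ~[[0.090 0.245 0.665]], no warnings
print(naive_softmax(large))   # [[nan nan nan]] plus the overflow warnings
```

Two smaller differences from the original code are also worth noting: predictions are taken with `np.argmax(a2, axis=1)` directly on the `m x 10` output (so no transpose is needed), and the `1/m` factor was dropped from the gradients, which is consistent with the much smaller learning rate (0.0001) and larger iteration count (5000) in the final call.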