Adding numbers using Neural network + back propagation
Question
I am trying to use neural networks to add numbers together, but I can't seem to get the back propagation function working.
This is what the neural network looks like, where W1 = x1, W2 = x2, W3 = y1, W4 = y2, W5 = z1 and W6 = z2:
This is my code so far:
from random import randint, random, uniform
import numpy as np


class Data:
    data_dict = {}

    def __init__(self, limit):
        self.limit = limit

    '''creates data but beware that the limit may not be the same as the size of the dictionary'''
    def create_data(self):
        for i in range(self.limit):
            num1 = randint(0, 100)
            num2 = randint(0, 100)
            self.data_dict[(num1, num2)] = num1 + num2


'''you compare the error with every test in the data set and find weights that minimise the error'''
class Neural:
    def __init__(self, data):
        self.x1 = uniform(-1, 1)
        self.x2 = uniform(-1, 1)
        self.y1 = uniform(-1, 1)
        self.y2 = uniform(-1, 1)
        self.z1 = uniform(-1, 1)
        self.z2 = uniform(-1, 1)
        self.data = data

    def relu(self, number):
        return max(0, number)

    def sigmoid(self, number):
        return 1 / (1 + np.exp(-number))

    '''weighted summation with activation function to compute output'''
    def compute_output(self, num1, num2):
        hidden_layer_input1 = self.sigmoid((num1 * self.x1) + (num2 * self.y1))
        hidden_layer_input2 = self.sigmoid((num1 * self.x2) + (num2 * self.y2))
        return (hidden_layer_input1 * self.z1) + (hidden_layer_input2 * self.z2)

    '''mean squared error between the actual output and the output generated by the algorithm'''
    def compare_ouput(self, data):
        '''actually, better to find error between all tests. add all the errors up'''
        error = 0
        for key in data.data_dict:
            error += abs(data.data_dict[key] - self.compute_output(key[0], key[1])) ** 2
        return error / len(data.data_dict)
        # return abs(actual - self.compute_output(num1, num2))

    '''TODO function that changes the weight depending on the errors using gradient descent'''
    '''first make it random'''
    '''next perhaps change weights for each test and average out the adjustments for each weight'''
    def random_back_propagation(self):
        error = 100000
        while error > 0.1:
            self.x1 = random()
            self.x2 = random()
            self.y1 = random()
            self.y2 = random()
            self.z1 = random()
            self.z2 = random()
            error = self.compare_ouput(self.data)
            print(error)
        print(self.compute_output(140, 15))

    '''learning rate is the amount the weights are updated during training'''
    def back_propagation(self, learning_rate):
        for _ in range(1000):
            for key in self.data.data_dict:
                num1, num2 = key
                target = self.data.data_dict[key]
                hidden_layer1_output = self.sigmoid((num1 * self.x1) + (num2 * self.y1))
                hidden_layer2_output = self.sigmoid((num1 * self.x2) + (num2 * self.y2))
                output = (hidden_layer1_output * self.z1) + (hidden_layer2_output * self.z2)
                error = target - output
                # check if you are happy with the error
                output_unit1 = output * (1 - output) * error
                hidden_unit1 = hidden_layer1_output * (1 - hidden_layer1_output) * self.z1 * output_unit1
                hidden_unit2 = hidden_layer2_output * (1 - hidden_layer2_output) * self.z2 * output_unit1
                self.z1 += (learning_rate * output * hidden_layer1_output)
                self.z2 += (learning_rate * output * hidden_layer2_output)
                self.x2 += (learning_rate * hidden_unit2 * num1)
                self.x1 += (learning_rate * hidden_unit1 * num1)
                self.y1 += (learning_rate * hidden_unit1 * num2)
                self.y2 += (learning_rate * hidden_unit2 * num2)
                print(self.x1, self.x2, self.y1, self.y2, self.z1, self.z2)
                print(num1, num2, self.compute_output(num1, num2))


data = Data(200)
data.create_data()
neural = Neural(data)
neural.back_propagation(0.01)
# print(data.data_array)
# print(uniform(-1, 1))
print(neural.compute_output(15, 7))
I tried changing the learning rate, the number of iterations, and the number of items in the data set, but I'm not sure whether the problem is with finding the right values to use or whether my function is simply incorrect.
Answer 1
Score: 0
There are some problems with your current approach. I'll try to elaborate in the following sections.
Loss function
The loss function you are currently using is error = target - output. At first glance it seems to work, because if target and output are equal, error will be 0. But remember that the network will try to minimize the loss function, and in this case one of the ways to do it is by generating an extremely large output, so that error will be negative.
I suggest using something like an MSE (Mean Squared Error): error = (target - output) ** 2. That way, if the network wants to minimize it, it will have no choice but to try to make target and output equal.
Gradient updates
Currently you are updating your parameters with the positive gradient: self.z1 += (learning_rate * gradient). The gradient points in the direction of greatest increase of the loss function. We don't want to increase the loss; we want to go in the direction of greatest decrease, so we use the negative gradient: self.z1 -= (learning_rate * gradient).
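As a minimal, self-contained sketch of why the sign matters (a toy single-weight example, not the network above): fitting one weight w so that w * x matches a target under the squared error only converges when we step against the gradient.

# toy example: one weight w, loss (target - w * x) ** 2, so the optimum is w == target / x == 5
x, target = 2.0, 10.0
w = 0.0
learning_rate = 0.05
for step in range(100):
    output = w * x
    gradient = -2 * (target - output) * x   # d(loss)/d(w)
    w -= learning_rate * gradient           # minus sign: move against the gradient
print(w)  # converges to roughly 5.0; with "+=" the same loop diverges instead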
The sigmoid function
You are using a sigmoid as the activation function for some of your hidden units. Here's the sigmoid graph:
Notice that when X > 6 or X < -6, the sigmoid basically flatlines. This flatline translates into an extremely small gradient, practically 0, and that means the parameters update extremely slowly or not at all. The network is saturated.
The data that you are feeding to the network consists of numbers that are larger than what neural networks normally work with. Consider that one of your hidden units has its weights randomly initialized to 0.5 and 0.3. Now you feed the network the numbers you wish to sum: 50 and 20. The computation at that hidden unit will be sigmoid(50 * 0.5 + 20 * 0.3), which is sigmoid(31), and its derivative is effectively 0.
If you still want to use the sigmoid, I recommend that you preprocess your input data so that the network won't receive such large numbers. For example, you can divide every input by 100, as the largest possible integer in the training data is 100.
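For intuition, here is a minimal sketch (reusing the numbers from the example above) of how flat the sigmoid is at a pre-activation of 31, and how dividing the inputs by 100 brings the gradient back to a usable size.

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

raw = 50 * 0.5 + 20 * 0.3                     # 31: deep in the flat region of the sigmoid
scaled = (50 / 100) * 0.5 + (20 / 100) * 0.3  # 0.31 after dividing the inputs by 100
print(sigmoid_derivative(raw))     # ~3e-14, effectively zero, so the weights barely move
print(sigmoid_derivative(scaled))  # ~0.24, a gradient the network can actually learn from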
Putting it all together
Below is the code with the above-mentioned problems fixed. Since it uses a different loss function, all of the gradients are different (feel free to double-check, as I'm not that confident in calculus). I also added a print to keep track of the loss across epochs.
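For reference, the chain-rule derivation behind the updates in the code below can be written compactly (a sketch you can check the in-code comments against), with t the target, o the output, and h_1 = sigmoid(x1 * num1 + y1 * num2):

E = (t - o)^2, \qquad o = z_1 h_1 + z_2 h_2
\frac{\partial E}{\partial z_1} = -2\,(t - o)\,h_1
\frac{\partial E}{\partial x_1} = \frac{\partial E}{\partial o}\,\frac{\partial o}{\partial h_1}\,\frac{\partial h_1}{\partial x_1} = -2\,(t - o)\,z_1\,h_1(1 - h_1)\,\mathrm{num}_1

and symmetrically for z_2, x_2, y_1 and y_2.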
from random import randint, random, uniform
import numpy as np


class Data:
    data_dict = {}

    def __init__(self, limit):
        self.limit = limit

    """creates data but beware that the limit may not be the same as the size of the dictionary"""
    def create_data(self):
        for i in range(self.limit):
            num1 = randint(0, 100)
            num2 = randint(0, 100)
            self.data_dict[(num1, num2)] = num1 + num2


"""you compare the error with every test in the data set and find weights that minimise the error"""
class Neural:
    def __init__(self, data):
        self.x1 = uniform(-1, 1)
        self.x2 = uniform(-1, 1)
        self.y1 = uniform(-1, 1)
        self.y2 = uniform(-1, 1)
        self.z1 = uniform(-1, 1)
        self.z2 = uniform(-1, 1)
        self.data = data

    def relu(self, number):
        return max(0, number)

    def sigmoid(self, number):
        return 1 / (1 + np.exp(-number))

    """weighted summation with activation function to compute output"""
    def compute_output(self, num1, num2):
        num1, num2 = num1 / 100, num2 / 100
        hidden_layer_input1 = self.sigmoid((num1 * self.x1) + (num2 * self.y1))
        hidden_layer_input2 = self.sigmoid((num1 * self.x2) + (num2 * self.y2))
        return 100 * ((hidden_layer_input1 * self.z1) + (hidden_layer_input2 * self.z2))

    """mean squared error between the actual output and the output generated by the algorithm"""
    def compare_ouput(self, data):
        """actually, better to find error between all tests. add all the errors up"""
        error = 0
        for key in data.data_dict:
            error += abs(data.data_dict[key] - self.compute_output(key[0], key[1])) ** 2
        return error / len(data.data_dict)
        # return abs(actual - self.compute_output(num1, num2))

    """TODO function that changes the weight depending on the errors using gradient descent"""
    """first make it random"""
    """next perhaps change weights for each test and average out the adjustments for each weight"""
    def random_back_propagation(self):
        error = 100000
        while error > 0.1:
            self.x1 = random()
            self.x2 = random()
            self.y1 = random()
            self.y2 = random()
            self.z1 = random()
            self.z2 = random()
            error = self.compare_ouput(self.data)
            print(error)
        print(self.compute_output(140, 15))

    """learning rate is the amount the weights are updated during training"""
    def back_propagation(self, learning_rate):
        for epoch in range(1000):
            errors = []
            for key in self.data.data_dict:
                num1, num2 = key
                target = self.data.data_dict[key]

                # Rescaling everything
                num1, num2 = num1 / 100, num2 / 100
                target = target / 100

                hidden_layer1_output = self.sigmoid((num1 * self.x1) + (num2 * self.y1))
                hidden_layer2_output = self.sigmoid((num1 * self.x2) + (num2 * self.y2))
                output = (hidden_layer1_output * self.z1) + (hidden_layer2_output * self.z2)

                # loss
                error = (target - output) ** 2
                # check if you are happy with the error

                # derivative of error with respect to output
                # d(error)/d(output) = -2 * (target - output)

                # derivative of error with respect to x1
                # d(error)/d(x1) = d(error)/d(output) * d(output)/d(x1)
                #                                       /-> hidden_layer1_output
                # d(error)/d(x1) = d(error)/d(output) * ( d(output)/d(hl1) * d(hl1)/d(x1) )
                # derivative of output with respect to hl1
                # d(output)/d(hl1) = z1
                # derivative of hl1 with respect to x1
                # d(hl1)/d(x1) = hidden_layer1_output * (1 - hidden_layer1_output) * num1
                # d(error)/d(x1) = -2 * (target - output) * z1 * hidden_layer1_output * (1 - hidden_layer1_output) * num1
                self.x1 -= learning_rate * -2 * (target - output) * self.z1 * hidden_layer1_output * (1 - hidden_layer1_output) * num1
                self.y1 -= learning_rate * -2 * (target - output) * self.z1 * hidden_layer1_output * (1 - hidden_layer1_output) * num2
                self.x2 -= learning_rate * -2 * (target - output) * self.z2 * hidden_layer2_output * (1 - hidden_layer2_output) * num1
                self.y2 -= learning_rate * -2 * (target - output) * self.z2 * hidden_layer2_output * (1 - hidden_layer2_output) * num2

                # derivative of error with respect to z1
                # d(error)/d(z1) = d(error)/d(output) * d(output)/d(z1)
                # derivative of output with respect to z1
                # d(output)/d(z1) = hidden_layer1_output
                # d(error)/d(z1) = -2 * (target - output) * hidden_layer1_output
                self.z1 -= learning_rate * -2 * (target - output) * hidden_layer1_output
                self.z2 -= learning_rate * -2 * (target - output) * hidden_layer2_output

                # print(self.x1, self.x2, self.y1, self.y2, self.z1, self.z2)
                errors.append(error)
            print(f"Mean error: {np.mean(errors)}")


data = Data(2000)
data.create_data()
neural = Neural(data)
neural.back_propagation(0.1)

print("#################################PREDICTIONS############################################")
print(f"15 + 7 = {neural.compute_output(15, 7)}")
print(f"3 + 2 = {neural.compute_output(3, 2)}")
print(f"50 + 70 = {neural.compute_output(50, 70)}")