将TensorFlow模型转换为PyTorch模型 – 模型没有学习

huangapple go评论62阅读模式
英文:

Convert TensorFlow model to PyTorch model - model isn't learning

问题

以下是您要翻译的代码部分:

import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.datasets import imdb

# Load the IMDB reviews as sequences of word indices, keeping a 5000-word vocabulary.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)
# Bring every review to exactly 400 tokens; padding is appended at the end ("post").
x_train = sequence.pad_sequences(x_train, maxlen=400, padding="post")
x_test = sequence.pad_sequences(x_test, maxlen=400, padding="post")
# Text CNN: embedding -> dropout -> Conv1D(+ReLU) -> global max pool -> dense head.
model = Sequential()
model.add(Embedding(5000, 50, input_length=400))
model.add(Dropout(0.2))
model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
# Fixed: the original line read `model add(...)`, which is a syntax error.
model.add(GlobalMaxPooling1D())
model.add(Dense(250))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# Single sigmoid unit: probability of the positive class.
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train for 10 epochs, evaluating on the test split after each one.
h2 = model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test, y_test))
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

class CustomDataset(Dataset):
    """Index-aligned pairing of a feature container and a label container."""

    def __init__(self, x, y):
        # Only references are stored; nothing is copied or validated.
        self.x, self.y = x, y

    def __len__(self):
        # The label container determines the dataset size.
        return len(self.y)

    def __getitem__(self, idx):
        # Return the (features, label) pair found at position ``idx``.
        features = self.x[idx]
        label = self.y[idx]
        return features, label

# Wrap both splits as tensors (torch.Tensor(...) uses the default floating
# dtype) and serve them in shuffled batches of 32.
train_dataloader = DataLoader(
    dataset=CustomDataset(torch.Tensor(x_train), torch.Tensor(y_train)),
    batch_size=32,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=CustomDataset(torch.Tensor(x_test), torch.Tensor(y_test)),
    batch_size=32,
    shuffle=True,
)

class MyModel(torch.nn.Module):
    """PyTorch port of the Keras text CNN, exactly as posted in the question.

    Layers: embedding -> dropout -> Conv1d -> adaptive max pool (to length 1)
    -> linear -> dropout -> ReLU -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        input_len: sequence length used when reshaping for the convolution.
        embedding_dims: embedding width and Conv1d input channel count.
        kernel_size: Conv1d kernel width.
        filters: Conv1d output channel count.
        hidden_dims: in/out width of the first linear layer (must equal
            ``filters`` for the shapes to line up).
    """

    def __init__(self, vocab_size=5000, input_len=400, embedding_dims=50, kernel_size=3, filters=250, hidden_dims=250):
        super(MyModel, self).__init__()
        self.embedding_dims = embedding_dims
        self.input_len = input_len
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dims)
        self.dropout1 = torch.nn.Dropout(p=0.2)
        self.conv1d = torch.nn.Conv1d(in_channels=embedding_dims, out_channels=filters, kernel_size=kernel_size, padding=(0,), stride=1)
        self.pool = torch.nn.AdaptiveMaxPool1d(1)
        self.linear1 = torch.nn.Linear(in_features=hidden_dims, out_features=hidden_dims)
        self.dropout2 = torch.nn.Dropout(p=0.2)
        self.activation = torch.nn.ReLU()
        self.output = torch.nn.Linear(in_features=hidden_dims, out_features=1)
        self.activation2 = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_len) batch of token ids to (batch, 1) probabilities."""
        # Token ids arrive as floats from the DataLoader; cast back for the lookup.
        x = self.dropout1(self.embedding(x.type(torch.LongTensor)))
        # NOTE(review): .view only reinterprets the buffer as
        # (batch, embedding_dims, input_len); it does not swap the length and
        # channel axes. This is the defect the question is about and is kept
        # as posted -- see the accepted answer for the corrected forward pass.
        x = self.conv1d(x.view(-1, self.embedding_dims, self.input_len))
        x = self.pool(x)
        # Fixed: the closing parenthesis of self.activation(...) was missing.
        x = self.activation(self.dropout2(self.linear1(x.view(-1, x.size()[1]))))
        x = self.activation2(self.output(x))
        return x

class FitTorchModel():
    """Small Keras-style training harness for a binary classifier.

    Trains ``model`` with Adam (lr=0.001) and ``BCELoss`` and records the mean
    per-batch loss and accuracy for training and validation after each epoch.
    """

    def __init__(self, model, num_epochs=10, steps_per_epoch=782):
        """Store the model and the loop sizes.

        Args:
            model: module whose output is fed to ``BCELoss`` (so it is
                expected to emit probabilities in [0, 1]).
            num_epochs: number of epochs run by ``fit``.
            steps_per_epoch: maximum number of batches drawn from a loader
                per pass (used for both training and validation).
        """
        self.model = model
        self.epochs = num_epochs
        self.steps_per_epoch = steps_per_epoch

    def fit(self, train_dataloader, test_dataloader):
        """Train the model and return the per-epoch history.

        Args:
            train_dataloader: iterable of ``(x, y)`` training batches.
            test_dataloader: iterable of ``(x, y)`` validation batches.

        Returns:
            DataFrame with one row per epoch and the columns
            ``Loss``, ``Accuracy``, ``Val_Loss`` and ``Val_Acc``.
        """
        opt = torch.optim.Adam(self.model.parameters(), lr=0.001)
        crit = torch.nn.BCELoss(reduction="mean")
        columns = ["Loss", "Accuracy", "Val_Loss", "Val_Acc"]
        rows = []
        for epoch in range(self.epochs):
            self.model.train()
            print(f"Epoch {epoch}")
            epoch_loss = 0.0
            epoch_acc = 0.0
            n_batches = 0
            # zip() stops at whichever ends first, so a loader with fewer than
            # steps_per_epoch batches no longer raises StopIteration.
            batches = zip(range(self.steps_per_epoch), train_dataloader)
            for _, (x, y) in tqdm(batches, total=self.steps_per_epoch):
                opt.zero_grad()
                y_pred = self.model(x).view(-1)
                loss = crit(y_pred, y)
                epoch_loss += loss.item()
                epoch_acc += accuracy_score(y == 1, y_pred > 0.5)
                loss.backward()
                opt.step()
                n_batches += 1
            val_loss, val_acc = self.predict_proba(test_dataloader, crit)
            # Average over the batches actually seen; guard the empty case.
            denom = max(n_batches, 1)
            rows.append({"Loss": epoch_loss / denom,
                         "Accuracy": epoch_acc / denom,
                         "Val_Loss": val_loss, "Val_Acc": val_acc})
        # Build the frame once instead of concatenating onto an empty frame.
        return pd.DataFrame(rows, columns=columns)

    def predict_proba(self, test_dataloader, crit):
        """Evaluate the model without gradient tracking.

        Args:
            test_dataloader: iterable of ``(x, y)`` validation batches.
            crit: loss callable applied as ``crit(y_pred, y)``.

        Returns:
            Tuple ``(mean_batch_loss, mean_batch_accuracy)``.
        """
        self.model.eval()
        val_loss = 0.0
        val_acc = 0.0
        n_batches = 0
        with torch.no_grad():
            batches = zip(range(self.steps_per_epoch), test_dataloader)
            for _, (x, y) in tqdm(batches, total=self.steps_per_epoch):
                y_pred = self.model(x).view(-1)
                batch_loss = crit(y_pred, y)
                val_loss += batch_loss.item()
                val_acc += accuracy_score(y == 1, y_pred > 0.5)
                n_batches += 1
        denom = max(n_batches, 1)
        return val_loss / denom, val_acc / denom

# Train a freshly constructed MyModel: 10 epochs of up to 782 batches each,
# keeping the per-epoch metrics returned by fit().
ftm = FitTorchModel(
    model=MyModel(),
    num_epochs=10,
    steps_per_epoch=782,
)
history_df = ftm.fit(
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
)
英文:

I'm trying to port a tensorflow neural network to pytorch, as an exercise to familiarize myself with both / their nuances. This is the tensorflow network I'm porting to pytorch:

import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.datasets import imdb
# IMDB reviews as word-index sequences, restricted to a 5000-word vocabulary.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)
# Every review is brought to 400 tokens, with padding appended at the end.
x_train = sequence.pad_sequences(x_train, maxlen=400, padding="post")
x_test = sequence.pad_sequences(x_test, maxlen=400, padding="post")
# Same layer stack as before, declared in one list instead of repeated add() calls.
model = Sequential([
    Embedding(input_dim=5000, output_dim=50, input_length=400),
    Dropout(0.2),
    Conv1D(filters=250, kernel_size=3, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(units=250),
    Dropout(0.2),
    Activation('relu'),
    Dense(units=1),
    Activation('sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# 10 epochs in batches of 32, validating on the test split after each epoch.
h2 = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)

The shape of each layer is shown below:

Model: "sequential"
_________________________________________________________________
Layer (type)                Output Shape              Param #   
=================================================================
embedding (Embedding)       (None, 400, 50)           250000    
dropout (Dropout)           (None, 400, 50)           0         
conv1d (Conv1D)             (None, 398, 250)          37750     
global_max_pooling1d (Globa  (None, 250)              0         
lMaxPooling1D)                                                  
dense (Dense)               (None, 250)               62750     
dropout_1 (Dropout)         (None, 250)               0         
activation (Activation)     (None, 250)               0         
dense_1 (Dense)             (None, 1)                 251       
activation_1 (Activation)   (None, 1)                 0         
=================================================================
Total params: 350,751
Trainable params: 350,751
Non-trainable params: 0

And the output of the tensorflow model is:

Epoch 1/10
loss: 0.4043 - accuracy: 0.8021 - val_loss: 0.2764 - val_accuracy: 0.8854
Epoch 2/10
loss: 0.2332 - accuracy: 0.9052 - val_loss: 0.2690 - val_accuracy: 0.8888
Epoch 3/10
loss: 0.1598 - accuracy: 0.9389 - val_loss: 0.2948 - val_accuracy: 0.8832
Epoch 4/10
loss: 0.1112 - accuracy: 0.9600 - val_loss: 0.3015 - val_accuracy: 0.8906
Epoch 5/10
loss: 0.0810 - accuracy: 0.9700 - val_loss: 0.3057 - val_accuracy: 0.8868
Epoch 6/10
loss: 0.0537 - accuracy: 0.9811 - val_loss: 0.4055 - val_accuracy: 0.8868
Epoch 7/10
loss: 0.0408 - accuracy: 0.9860 - val_loss: 0.4083 - val_accuracy: 0.8852
Epoch 8/10
loss: 0.0411 - accuracy: 0.9845 - val_loss: 0.4789 - val_accuracy: 0.8789
Epoch 9/10
loss: 0.0380 - accuracy: 0.9862 - val_loss: 0.4828 - val_accuracy: 0.8827
Epoch 10/10
loss: 0.0329 - accuracy: 0.9879 - val_loss: 0.4999 - val_accuracy: 0.8825

Here's what I have in my PyTorch port over:

from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
class CustomDataset(Dataset):
    """Index-aligned pairing of a feature container ``x`` and labels ``y``."""

    def __init__(self, x, y):
        # Only references are stored; nothing is copied or validated.
        self.x = x
        self.y = y

    def __len__(self):
        # The label container determines the dataset size.
        return len(self.y)

    def __getitem__(self, idx):
        # Return the (features, label) pair found at position ``idx``.
        return self.x[idx], self.y[idx]
# Wrap both splits as tensors (torch.Tensor(...) uses the default floating
# dtype) and serve them in shuffled batches of 32.
train_dataloader = DataLoader(
    dataset=CustomDataset(torch.Tensor(x_train), torch.Tensor(y_train)),
    batch_size=32,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=CustomDataset(torch.Tensor(x_test), torch.Tensor(y_test)),
    batch_size=32,
    shuffle=True,
)
class MyModel(torch.nn.Module):
    """PyTorch port of the Keras text CNN, exactly as posted in the question.

    Layers: embedding -> dropout -> Conv1d -> adaptive max pool (to length 1)
    -> linear -> dropout -> ReLU -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        input_len: sequence length used when reshaping for the convolution.
        embedding_dims: embedding width and Conv1d input channel count.
        kernel_size: Conv1d kernel width.
        filters: Conv1d output channel count.
        hidden_dims: in/out width of the first linear layer (must equal
            ``filters`` for the shapes to line up).
    """

    def __init__(self, vocab_size=5000, input_len=400, embedding_dims=50, kernel_size=3, filters=250, hidden_dims=250):
        super(MyModel, self).__init__()
        self.embedding_dims = embedding_dims
        self.input_len = input_len
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dims)
        self.dropout1 = torch.nn.Dropout(p=0.2)
        self.conv1d = torch.nn.Conv1d(in_channels=embedding_dims, out_channels=filters, kernel_size=kernel_size, padding=(0,), stride=1)
        self.pool = torch.nn.AdaptiveMaxPool1d(1)
        self.linear1 = torch.nn.Linear(in_features=hidden_dims, out_features=hidden_dims)
        self.dropout2 = torch.nn.Dropout(p=0.2)
        self.activation = torch.nn.ReLU()
        self.output = torch.nn.Linear(in_features=hidden_dims, out_features=1)
        self.activation2 = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_len) batch of token ids to (batch, 1) probabilities."""
        # Token ids arrive as floats from the DataLoader; cast back for the lookup.
        x = self.dropout1(self.embedding(x.type(torch.LongTensor)))
        # NOTE(review): .view only reinterprets the buffer as
        # (batch, embedding_dims, input_len); it does not swap the length and
        # channel axes. This is the defect the question is about and is kept
        # as posted -- see the accepted answer for the corrected forward pass.
        x = self.conv1d(x.view(-1, self.embedding_dims, self.input_len))
        x = self.pool(x)
        x = self.activation(self.dropout2(self.linear1(x.view(-1, x.size()[1]))))
        x = self.activation2(self.output(x))
        return x
class FitTorchModel():
    """Small Keras-style training harness for a binary classifier.

    Trains ``model`` with Adam (lr=0.001) and ``BCELoss`` and records the mean
    per-batch loss and accuracy for training and validation after each epoch.
    """

    def __init__(self, model, num_epochs=10, steps_per_epoch=782):
        """Store the model and the loop sizes.

        Args:
            model: module whose output is fed to ``BCELoss`` (so it is
                expected to emit probabilities in [0, 1]).
            num_epochs: number of epochs run by ``fit``.
            steps_per_epoch: maximum number of batches drawn from a loader
                per pass (used for both training and validation).
        """
        self.model = model
        self.epochs = num_epochs
        self.steps_per_epoch = steps_per_epoch

    def fit(self, train_dataloader, test_dataloader):
        """Train the model and return the per-epoch history.

        Args:
            train_dataloader: iterable of ``(x, y)`` training batches.
            test_dataloader: iterable of ``(x, y)`` validation batches.

        Returns:
            DataFrame with one row per epoch and the columns
            ``Loss``, ``Accuracy``, ``Val_Loss`` and ``Val_Acc``.
        """
        opt = torch.optim.Adam(self.model.parameters(), lr=0.001)
        crit = torch.nn.BCELoss(reduction="mean")
        columns = ["Loss", "Accuracy", "Val_Loss", "Val_Acc"]
        rows = []
        for epoch in range(self.epochs):
            self.model.train()
            print(f"Epoch {epoch}")
            epoch_loss = 0.0
            epoch_acc = 0.0
            n_batches = 0
            # zip() stops at whichever ends first, so a loader with fewer than
            # steps_per_epoch batches no longer raises StopIteration.
            batches = zip(range(self.steps_per_epoch), train_dataloader)
            for _, (x, y) in tqdm(batches, total=self.steps_per_epoch):
                opt.zero_grad()
                y_pred = self.model(x).view(-1)
                loss = crit(y_pred, y)
                epoch_loss += loss.item()
                epoch_acc += accuracy_score(y == 1, y_pred > 0.5)
                loss.backward()
                opt.step()
                n_batches += 1
            val_loss, val_acc = self.predict_proba(test_dataloader, crit)
            # Average over the batches actually seen; guard the empty case.
            denom = max(n_batches, 1)
            rows.append({"Loss": epoch_loss / denom,
                         "Accuracy": epoch_acc / denom,
                         "Val_Loss": val_loss, "Val_Acc": val_acc})
        # Build the frame once instead of concatenating onto an empty frame.
        return pd.DataFrame(rows, columns=columns)

    def predict_proba(self, test_dataloader, crit):
        """Evaluate the model without gradient tracking.

        Args:
            test_dataloader: iterable of ``(x, y)`` validation batches.
            crit: loss callable applied as ``crit(y_pred, y)``.

        Returns:
            Tuple ``(mean_batch_loss, mean_batch_accuracy)``.
        """
        self.model.eval()
        val_loss = 0.0
        val_acc = 0.0
        n_batches = 0
        with torch.no_grad():
            batches = zip(range(self.steps_per_epoch), test_dataloader)
            for _, (x, y) in tqdm(batches, total=self.steps_per_epoch):
                y_pred = self.model(x).view(-1)
                batch_loss = crit(y_pred, y)
                val_loss += batch_loss.item()
                val_acc += accuracy_score(y == 1, y_pred > 0.5)
                n_batches += 1
        denom = max(n_batches, 1)
        return val_loss / denom, val_acc / denom
# Train a freshly constructed MyModel: 10 epochs of up to 782 batches each,
# keeping the per-epoch metrics returned by fit().
ftm = FitTorchModel(
    model=MyModel(),
    num_epochs=10,
    steps_per_epoch=782,
)
history_df = ftm.fit(
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
)

The shape of each layer is:

After embedding layer: torch.Size([32, 400, 50])
After dropout1 layer: torch.Size([32, 400, 50])
After convolution1d layer: torch.Size([32, 250, 398])
After maxpooling layer: torch.Size([32, 250, 1])
After linear1 layer: torch.Size([32, 250])
After dropout2 layer: torch.Size([32, 250])
After activation layer: torch.Size([32, 250])
After output layer: torch.Size([32, 1])
After activation2 layer: torch.Size([32, 1])

The output of the pytorch model training is:

       Loss  Accuracy  Val_Loss   Val_Acc
0  0.697899  0.505874  0.692495  0.511629
1  0.693063  0.503477  0.693186  0.503637
2  0.693190  0.496044  0.693149  0.499201
3  0.693181  0.501359  0.693082  0.502038
4  0.693169  0.503237  0.693234  0.495964
5  0.693177  0.500240  0.693154  0.500679
6  0.693069  0.507473  0.693258  0.498881
7  0.693948  0.500320  0.693145  0.501598
8  0.693196  0.499640  0.693164  0.496324
9  0.693170  0.500759  0.693140  0.501918

A couple of things: first, the accuracy hovers around chance level (this is a binary classification task), no matter how many epochs have passed. Second, the training loss barely improves. I set the learning rate to the default learning rate described in the docs for TensorFlow's Adam optimizer. What else am I missing here? I had some trouble with the input/output dimensions of the various layers - did I mess those up at all?

答案1

得分: 2

以下是代码部分的翻译:

class MyModel(torch.nn.Module):
    """Corrected PyTorch port of the Keras IMDB text CNN.

    Layers: embedding -> dropout -> Conv1d + ReLU -> global max pool
    -> linear -> dropout -> ReLU -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        input_len: nominal sequence length (stored; not needed by ``forward``).
        embedding_dims: embedding width and Conv1d input channel count.
        kernel_size: Conv1d kernel width.
        filters: Conv1d output channel count.
        hidden_dims: in/out width of the first linear layer (must equal
            ``filters`` for the shapes to line up).
    """

    def __init__(self, vocab_size=5000, input_len=400, embedding_dims=50, kernel_size=3, filters=250, hidden_dims=250):
        super(MyModel, self).__init__()
        self.embedding_dims = embedding_dims
        self.input_len = input_len
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dims)
        self.dropout1 = torch.nn.Dropout(p=0.2)
        self.conv1d = torch.nn.Conv1d(in_channels=embedding_dims, out_channels=filters, kernel_size=kernel_size, padding=(0,), stride=1)
        self.pool = torch.nn.AdaptiveMaxPool1d(1)
        self.linear1 = torch.nn.Linear(in_features=hidden_dims, out_features=hidden_dims)
        self.dropout2 = torch.nn.Dropout(p=0.2)
        self.activation = torch.nn.ReLU()
        self.output = torch.nn.Linear(in_features=hidden_dims, out_features=1)
        self.activation2 = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, seq_len) batch of token ids to (batch, 1) probabilities."""
        # .long() keeps the tensor on its current device, whereas
        # .type(torch.LongTensor) always produces a CPU tensor.
        x = self.dropout1(self.embedding(x.long()))
        # Conv1d expects (batch, channels, length): swap the last two axes
        # rather than reinterpreting memory with .view.
        x = self.activation(self.conv1d(x.moveaxis(-1, -2)))
        # Global max over the length axis, then drop that singleton axis.
        x = self.pool(x).squeeze(-1)
        x = self.activation(self.dropout2(self.linear1(x)))
        x = self.activation2(self.output(x))
        return x

请注意,这是代码的翻译部分,没有包括其他内容。

英文:

Some observations:

  • Use BCEWithLogitsLoss as loss on the output of the last linear layer, before the sigmoid. This includes the sigmoid activation in a more numerically stable fashion.
  • The TensorFlow model has a ReLU after the convolution; the PyTorch implementation does not.

In general, for debugging, one might want to look at weight.grad of some of your weights after the loss.backward() call and see whether gradients are being calculated. Printing out the value of one of the weights in each iteration, to see whether your optimizer actually changes the weights, can also help...

Also, it can depend on the input data:
(Are you sure that x_test is scaled correctly?)
If you are transforming your inputs to Long before embedding them and all x_test, for example, are floats between 0 and 1, they will all be converted to 0! And the network will have a hard time predicting the labels from all zeros as constant input!

But now to the actual issue in this particular case:
Be careful with .view! It might not do what you expect. It just reshapes the tensor but does not move the data around.
What you really want is .moveaxis(-1, -2) instead!!

    	Loss	Accuracy	Val_Loss	Val_Acc
0	0.573489	0.671715	0.402601	0.819413
1	0.376908	0.830163	0.33786	    0.850783
2	0.308343	0.868646	0.296171	0.872323
3	0.258806	0.893342	0.319121	0.865849
4	0.227044	0.907649	0.3172  	0.868326
5	0.202789	0.918478	0.281184	0.886549
6	0.179744	0.928549	0.291027	0.886589
7	0.161205	0.93702	    0.329196	0.879156
8	0.145447	0.944094	0.294914	0.889746
9	0.133034	0.949568	0.291476	0.889826

After adding the relu after the convolution and, more importantly, fixing the view!

class MyModel(torch.nn.Module):
    """Corrected PyTorch port of the Keras IMDB text CNN.

    Layers: embedding -> dropout -> Conv1d + ReLU -> global max pool
    -> linear -> dropout -> ReLU -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        input_len: nominal sequence length (stored; not needed by ``forward``).
        embedding_dims: embedding width and Conv1d input channel count.
        kernel_size: Conv1d kernel width.
        filters: Conv1d output channel count.
        hidden_dims: in/out width of the first linear layer (must equal
            ``filters`` for the shapes to line up).
    """

    def __init__(self, vocab_size=5000, input_len=400, embedding_dims=50, kernel_size=3, filters=250, hidden_dims=250):
        super(MyModel, self).__init__()
        self.embedding_dims = embedding_dims
        self.input_len = input_len
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dims)
        self.dropout1 = torch.nn.Dropout(p=0.2)
        self.conv1d = torch.nn.Conv1d(in_channels=embedding_dims, out_channels=filters, kernel_size=kernel_size, padding=(0,), stride=1)
        self.pool = torch.nn.AdaptiveMaxPool1d(1)
        self.linear1 = torch.nn.Linear(in_features=hidden_dims, out_features=hidden_dims)
        self.dropout2 = torch.nn.Dropout(p=0.2)
        self.activation = torch.nn.ReLU()
        self.output = torch.nn.Linear(in_features=hidden_dims, out_features=1)
        self.activation2 = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, seq_len) batch of token ids to (batch, 1) probabilities."""
        # .long() keeps the tensor on its current device, whereas
        # .type(torch.LongTensor) always produces a CPU tensor.
        x = self.dropout1(self.embedding(x.long()))
        # Conv1d expects (batch, channels, length): swap the last two axes
        # rather than reinterpreting memory with .view.
        x = self.activation(self.conv1d(x.moveaxis(-1, -2)))
        # Global max over the length axis, then drop that singleton axis.
        x = self.pool(x).squeeze(-1)
        x = self.activation(self.dropout2(self.linear1(x)))
        x = self.activation2(self.output(x))
        return x

答案2

得分: 1

你在 fit 函数中用来初始化 opt 的 tinymodel 是什么:

opt = torch.optim.Adam(tinymodel.parameters(), lr=0.001)

看起来你的优化器没有作用在正确的模型上(参考这个回答,了解优化器和模型参数之间的关系)。

你需要在 fit 函数中替换以下这行代码:

def fit(self, train_dataloader, test_dataloader):
    # Build the optimizer from self.model's own parameters so that
    # opt.step() updates the model that is actually being trained.
    opt = torch.optim.Adam(self.model.parameters(), lr=0.001)
    # ...

另外,你使用了在训练和测试时行为不同的 Dropout 层。你应该在 fit 和 predict_proba 函数的开头分别添加 self.model.train() 和 self.model.eval()。

英文:

What is the tinymodel that you initialize opt with in the fit function?

opt = torch.optim.Adam(tinymodel.parameters(), lr=0.001)

It seems like your optimizer is not working on the right model (see this answer on the relation between the optimizer and the parameters of the model).

You need to replace this line in fit function:

        
    def fit(self, train_dataloader, test_dataloader):
        # Build the optimizer from self.model's own parameters so that
        # opt.step() updates the model that is actually being trained.
        opt = torch.optim.Adam(self.model.parameters(), lr=0.001)
        # ...

Additionally, you are using a Dropout layer, which behaves differently during training and testing.
You should add self.model.train() and self.model.eval() at the beginning of your fit and predict_proba functions, respectively.

huangapple
  • 本文由 发表于 2023年2月6日 21:40:37
  • 转载请务必保留本文链接:https://go.coder-hub.com/75362059.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定