Transformers from scratch - shape '[1, 40, 64]' is invalid for input of size when passing input from encoder to decoder
Question
This is code that tries to build a Transformer model from scratch. The code imports the required libraries and defines a Transformer model consisting of an encoder and a decoder, each made up of multiple transformer layers. The Transformer model is trained with the Adam optimizer and a cross-entropy loss function, and the trained model is then used to predict the output for a test input.
The code also uses the BartTokenizer from the transformers library to tokenize the input and output text.
Regarding the error message "shape '[1, 40, 64]' is invalid for input of size 2048": this is usually caused by a mismatch between the dimensions the model expects and the dimensions of the data. You may want to check the following:
- Make sure the shape of the input data matches the shape the model expects. In this error, the model tries to reshape to [1, 40, 64], but the actual input only contains 2048 elements. You need to make sure the input data has the correct shape.
- Check the model architecture, in particular the encoder and decoder layers. Make sure their dimensions are set correctly and are compatible with the dimensions of the input data.
- Check the data preprocessing steps, including text tokenization and tensor conversion. Make sure they behave as expected and do not introduce dimension errors.
- If the problem persists, try debugging the code step by step to find out which step causes the dimension mismatch (a small shape-printing sketch follows this list).
Hopefully these hints help you resolve the error. If you need further help, please provide more information about the input data and the model architecture.
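As a minimal sketch of that kind of step-by-step shape debugging (the add_shape_hooks helper below is hypothetical and not part of the original code), PyTorch forward hooks can print the shapes flowing through each submodule:

import torch
import torch.nn as nn

def add_shape_hooks(model: nn.Module):
    # Print the input/output shapes of every submodule during a forward pass.
    def hook(module, inputs, output):
        in_shapes = [tuple(t.shape) for t in inputs if torch.is_tensor(t)]
        out_desc = tuple(output.shape) if torch.is_tensor(output) else type(output).__name__
        print(f"{module.__class__.__name__}: in={in_shapes} out={out_desc}")
    for submodule in model.modules():
        submodule.register_forward_hook(hook)

# Usage (hypothetical): add_shape_hooks(model); model(input_tensor, output_tensor)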
Original English question:
I'm trying to build a transformer model from scratch:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BartTokenizer
class TransformerEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_heads, ff_size, dropout):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.pos_embedding = nn.Embedding(1000, hidden_size)  # Positional embedding
        self.dropout = nn.Dropout(dropout)
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            nn.Linear(ff_size, hidden_size)
        )
        self.layer_norm2 = nn.LayerNorm(hidden_size)
        self.num_layers = num_layers

    def forward(self, input_seq):
        seq_len, batch_size = input_seq.size()
        # Embedding and positional encoding
        embedded = self.embedding(input_seq)
        pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
        pos_embedded = self.pos_embedding(pos_ids)
        encoded = self.dropout(embedded + pos_embedded)
        # Transformer encoder layers
        for _ in range(self.num_layers):
            # Self-attention
            attention_output, _ = self.self_attention(encoded, encoded, encoded)
            attention_output = self.layer_norm1(encoded + self.dropout(attention_output))
            # Feed-forward
            ff_output = self.feed_forward(attention_output)
            ff_output = self.layer_norm2(attention_output + self.dropout(ff_output))
            encoded = ff_output
        return encoded
class TransformerDecoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers, num_heads, ff_size, dropout):
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.pos_embedding = nn.Embedding(1000, hidden_size)  # Positional embedding
        self.dropout = nn.Dropout(dropout)
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.encoder_attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.layer_norm2 = nn.LayerNorm(hidden_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            nn.Linear(ff_size, hidden_size)
        )
        self.layer_norm3 = nn.LayerNorm(hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.num_layers = num_layers

    def forward(self, input_seq, encoder_output):
        seq_len, batch_size = input_seq.size()
        # Embedding and positional encoding
        embedded = self.embedding(input_seq)
        pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
        pos_embedded = self.pos_embedding(pos_ids)
        encoded = self.dropout(embedded + pos_embedded)
        # Transformer decoder layers
        for _ in range(self.num_layers):
            # Self-attention
            self_attention_output, _ = self.self_attention(encoded, encoded, encoded)
            self_attention_output = self.layer_norm1(encoded + self.dropout(self_attention_output))
            # Encoder-decoder attention
            encoder_attention_output, _ = self.encoder_attention(self_attention_output, encoder_output, encoder_output)
            encoder_attention_output = self.layer_norm2(self_attention_output + self.dropout(encoder_attention_output))
            # Feed-forward
            ff_output = self.feed_forward(encoder_attention_output)
            ff_output = self.layer_norm3(encoder_attention_output + self.dropout(ff_output))
            encoded = ff_output
        output = self.fc(encoded)
        return output
class Transformer(nn.Module):
    def __init__(self, input_size, output_size, hidden_size, num_layers, num_heads, ff_size, dropout):
        super(Transformer, self).__init__()
        self.encoder = TransformerEncoder(input_size, hidden_size, num_layers, num_heads, ff_size, dropout)
        self.decoder = TransformerDecoder(output_size, hidden_size, num_layers, num_heads, ff_size, dropout)

    def forward(self, input_seq, target_seq):
        encoder_output = self.encoder(input_seq)
        output = self.decoder(target_seq, encoder_output)
        return output
# Example parameters
input_size = 50265 # Vocabulary size
output_size = 50265 # Vocabulary size
hidden_size = 256 # Hidden state size of the transformer layers
num_layers = 2 # Number of transformer layers
num_heads = 4 # Number of attention heads
ff_size = 1024 # Feed-forward layer size
dropout = 0.1 # Dropout rate
# Example training loop
input_text = "Hello, how are you?"
output_text = "I am doing well, thank you."
# Tokenization using Transformers
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
tokenizer.add_tokens(["<custom_token_1>", "<custom_token_2>"])
# Tokenize input and output text
input_tokens = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, padding=True)
output_tokens = tokenizer.encode(output_text, add_special_tokens=True, truncation=True, padding=True)
# Convert tokens to tensors
input_tensor = torch.tensor(input_tokens).unsqueeze(0) # Add batch dimension
output_tensor = torch.tensor(output_tokens).unsqueeze(0) # Add batch dimension
# Initialize the transformer model
model = Transformer(input_size, output_size, hidden_size, num_layers, num_heads, ff_size, dropout)
# Example training loop
num_epochs = 10
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
    # Perform forward pass
    output = model(input_tensor, output_tensor)
    # Compute loss
    loss = F.cross_entropy(output.view(-1, output_size), output_tensor.view(-1))
    # Backpropagation and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Print the loss for monitoring
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
# Test the trained model with input text
test_input_text = "How's the weather today?"
test_input_tokens = tokenizer.encode(test_input_text, add_special_tokens=True, truncation=True, padding=True)
test_input_tensor = torch.tensor(test_input_tokens).unsqueeze(0) # Add batch dimension
# Set the model to evaluation mode
model.eval()
# Perform forward pass with the test input
test_output = model(test_input_tensor, torch.zeros(1, 1).long())
# Get the predicted class
_, predicted_classes = torch.max(test_output, dim=2)
predicted_classes = predicted_classes.squeeze(0).tolist()
# Convert the predicted class tokens to text
predicted_text = tokenizer.decode(predicted_classes, skip_special_tokens=True)
# Print the predicted output text
print(f"Input: {test_input_text}")
print(f"Predicted Output: {predicted_text}")
The code imports the required libraries and defines a Transformer model with an Encoder and a Decoder. The Encoder and the Decoder consist of multiple transformer layers. The Transformer model is trained using the Adam optimizer and Cross-Entropy loss function. The trained model is then used to predict the output for a test input.
The code also uses the BartTokenizer from the transformers library to tokenize the input and output text.
Errors
The following error is encountered while running the code:
RuntimeError: shape '[1, 40, 64]' is invalid for input of size 2048
Answer 1
Score: 1
Let's try to walk through your code, starting from the tokenizer:
from transformers import BartTokenizer
# Example training loop
input_text = "Hello, how are you?"
output_text = "I am doing well, thank you."
# Tokenization using Transformers
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
tokenizer.add_tokens(["<custom_token_1>", "<custom_token_2>"])
# Tokenize input and output text
input_tokens = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, padding=True)
output_tokens = tokenizer.encode(output_text, add_special_tokens=True, truncation=True, padding=True)
# Convert tokens to tensors
input_tensor = torch.tensor(input_tokens).unsqueeze(0) # Add batch dimension
output_tensor = torch.tensor(output_tokens).unsqueeze(0) # Add batch dimension
print(input_tensor.shape, output_tensor.shape)
[out]:
torch.Size([1, 8]) torch.Size([1, 10])
From here you can already sense that this might be problematic: the sizes are different for every data point, and different for source and target too. You'll need to pad the inputs so that they are the same size when the model sees them.
Check the Annotated Transformer code, http://nlp.seas.harvard.edu/annotated-transformer/, where there is a collate_batch function that handles all these operations.
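For reference, here is a minimal sketch of such a collate step, assuming PyTorch's pad_sequence utility; it is not the Annotated Transformer's actual collate_batch, and the pad id of 1 simply matches BART's pad token:

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_batch(token_id_lists, pad_id=1):
    # Pad variable-length token-id lists to a common length and stack
    # them into a single (batch, max_len) LongTensor.
    tensors = [torch.tensor(ids, dtype=torch.long) for ids in token_id_lists]
    return pad_sequence(tensors, batch_first=True, padding_value=pad_id)

# e.g. collate_batch([input_tokens, output_tokens]) -> tensor of shape (2, 10)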
Next, try to run the input/output through one pass of the model:
import torch
import torch.nn as nn
import torch.nn.functional as F
class TransformerEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_heads, ff_size, dropout):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.pos_embedding = nn.Embedding(1000, hidden_size)  # Positional embedding
        self.dropout = nn.Dropout(dropout)
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            nn.Linear(ff_size, hidden_size)
        )
        self.layer_norm2 = nn.LayerNorm(hidden_size)
        self.num_layers = num_layers

    def forward(self, input_seq):
        seq_len, batch_size = input_seq.size()
        # Embedding and positional encoding
        embedded = self.embedding(input_seq)
        pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
        pos_embedded = self.pos_embedding(pos_ids)
        encoded = self.dropout(embedded + pos_embedded)
        # Transformer encoder layers
        for _ in range(self.num_layers):
            # Self-attention
            attention_output, _ = self.self_attention(encoded, encoded, encoded)
            attention_output = self.layer_norm1(encoded + self.dropout(attention_output))
            # Feed-forward
            ff_output = self.feed_forward(attention_output)
            ff_output = self.layer_norm2(attention_output + self.dropout(ff_output))
            encoded = ff_output
        return encoded
class TransformerDecoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers, num_heads, ff_size, dropout):
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.pos_embedding = nn.Embedding(1000, hidden_size)  # Positional embedding
        self.dropout = nn.Dropout(dropout)
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.encoder_attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.layer_norm2 = nn.LayerNorm(hidden_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            nn.Linear(ff_size, hidden_size)
        )
        self.layer_norm3 = nn.LayerNorm(hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.num_layers = num_layers

    def forward(self, input_seq, encoder_output):
        seq_len, batch_size = input_seq.size()
        # Embedding and positional encoding
        embedded = self.embedding(input_seq)
        pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
        pos_embedded = self.pos_embedding(pos_ids)
        encoded = self.dropout(embedded + pos_embedded)
        # Transformer decoder layers
        for _ in range(self.num_layers):
            # Self-attention
            self_attention_output, _ = self.self_attention(encoded, encoded, encoded)
            self_attention_output = self.layer_norm1(encoded + self.dropout(self_attention_output))
            # Encoder-decoder attention
            encoder_attention_output, _ = self.encoder_attention(self_attention_output, encoder_output, encoder_output)
            encoder_attention_output = self.layer_norm2(self_attention_output + self.dropout(encoder_attention_output))
            # Feed-forward
            ff_output = self.feed_forward(encoder_attention_output)
            ff_output = self.layer_norm3(encoder_attention_output + self.dropout(ff_output))
            encoded = ff_output
        output = self.fc(encoded)
        return output
# Example parameters
input_size = 50265 # Vocabulary size
output_size = 50265 # Vocabulary size
hidden_size = 256 # Hidden state size of the transformer layers
num_layers = 2 # Number of transformer layers
num_heads = 4 # Number of attention heads
ff_size = 1024 # Feed-forward layer size
dropout = 0.1 # Dropout rate
encoder = TransformerEncoder(input_size, hidden_size, num_layers, num_heads, ff_size, dropout)
decoder = TransformerDecoder(output_size, hidden_size, num_layers, num_heads, ff_size, dropout)
encoded = encoder(input_tensor)
decoder(output_tensor, encoded)
And you will see this error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[23], line 1
----> 1 decoder(output_tensor, encoded)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
Cell In[20], line 88, in TransformerDecoder.forward(self, input_seq, encoder_output)
85 self_attention_output = self.layer_norm1(encoded + self.dropout(self_attention_output))
87 # Encoder-decoder attention
---> 88 encoder_attention_output, _ = self.encoder_attention(self_attention_output, encoder_output, encoder_output)
89 encoder_attention_output = self.layer_norm2(self_attention_output + self.dropout(encoder_attention_output))
91 # Feed-forward
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/activation.py:1189, in MultiheadAttention.forward(self, query, key, value, key_padding_mask, need_weights, attn_mask, average_attn_weights, is_causal)
1175 attn_output, attn_output_weights = F.multi_head_attention_forward(
1176 query, key, value, self.embed_dim, self.num_heads,
1177 self.in_proj_weight, self.in_proj_bias,
(...)
1186 average_attn_weights=average_attn_weights,
1187 is_causal=is_causal)
1188 else:
-> 1189 attn_output, attn_output_weights = F.multi_head_attention_forward(
1190 query, key, value, self.embed_dim, self.num_heads,
1191 self.in_proj_weight, self.in_proj_bias,
1192 self.bias_k, self.bias_v, self.add_zero_attn,
1193 self.dropout, self.out_proj.weight, self.out_proj.bias,
1194 training=self.training,
1195 key_padding_mask=key_padding_mask,
1196 need_weights=need_weights,
1197 attn_mask=attn_mask,
1198 average_attn_weights=average_attn_weights,
1199 is_causal=is_causal)
1200 if self.batch_first and is_batched:
1201 return attn_output.transpose(1, 0), attn_output_weights
File /opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:5243, in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
5241 q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
5242 if static_k is None:
-> 5243 k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
5244 else:
5245 # TODO finish disentangling control flow so we don't do in-projections when statics are passed
5246 assert static_k.size(0) == bsz * num_heads, \
5247 f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
RuntimeError: shape '[1, 40, 64]' is invalid for input of size 2048
Now that we've established that the error comes from multi_head_attention_forward, we need to ask: what does that mean?
Q: When doing attention, do the dot products of the input and output tensors need to be the same size?
A: https://www.google.com/search?q=dot+product+need+to+be+the+same+siz
> The dot product of these two vectors is the sum of the products of elements at each position. In this case, the dot product is (1*2)+(2*4)+(3*6). Since we multiply elements at the same positions, the two vectors must have the same length in order to have a dot product.
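Reading the numbers off the traceback makes this concrete: with hidden_size = 256 and num_heads = 4, each head gets head_dim = 64; the source was tokenized into 8 ids and the target into 10, so the key coming from the encoder holds 1 * 8 * 256 = 2048 values, while the view requested inside cross-attention works out to [1, 10 * 4, 64] = [1, 40, 64], which needs 2560. A quick arithmetic check (the two sequence lengths are taken from the tokenizer output above, so treat them as assumptions about this particular example):

hidden_size, num_heads = 256, 4
head_dim = hidden_size // num_heads              # 64
src_len, tgt_len = 8, 10                         # unpadded lengths from the tokenizer

key_numel = 1 * src_len * hidden_size            # 2048 elements available
requested = (1, tgt_len * num_heads, head_dim)   # the view asks for [1, 40, 64]
print(key_numel, requested, 1 * tgt_len * num_heads * head_dim)  # 2048 vs (1, 40, 64) vs 2560

Once both sides are padded to the same length, the two counts agree and the reshape goes through, which is exactly what the padding fix below relies on.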
Now let's try to encode and decode with the same input on both sides:
encoded = encoder(input_tensor)
decoder(input_tensor, encoded)
[out]:
tensor([[[-0.2148, 0.1838, 1.0957, ..., 0.0339, 0.5636, 0.0724],
[-0.3618, 0.2255, -0.0054, ..., -0.8457, 0.7408, -0.2420],
[-0.4362, -0.1221, -0.1487, ..., -1.0708, 0.2463, 0.3195],
...,
[-0.4172, 0.4385, 0.0669, ..., -0.4320, 0.7453, 0.4359],
[-0.7655, 0.2335, -0.0070, ..., -0.2925, -0.0765, -0.0452],
[ 0.1498, 0.4205, 0.2231, ..., -0.6602, -0.4533, 0.1439]]],
grad_fn=<ViewBackward0>)
It works!
But I want it to translate input to output, not copy the input!
A: Try padding your input and output tensors
input_ids, output_ids = tokenizer(
[input_text, output_text],
add_special_tokens=True, truncation=True, padding=True,
pad_to_max_length=True
)['input_ids']
print(input_ids, output_ids)
[out]:
([0, 31414, 6, 141, 32, 47, 116, 2, 1, 1],
[0, 100, 524, 608, 157, 6, 3392, 47, 4, 2])
Now the model should pass through properly, with the inputs padded with id 1.
input_ids, output_ids = tokenizer(
[input_text, output_text],
add_special_tokens=True, truncation=True, padding=True,
pad_to_max_length=True
)['input_ids']
# Convert tokens to tensors
input_tensor = torch.tensor(input_ids).unsqueeze(0) # Add batch dimension
output_tensor = torch.tensor(output_ids).unsqueeze(0) # Add batch dimension
encoded = encoder(input_tensor)
decoder(output_tensor, encoded)
[out]:
tensor([[[-0.2411, 0.0112, 0.7898, ..., 0.0268, 0.5292, 0.3200],
[-0.4188, 0.2833, -0.0789, ..., -0.4128, 0.5024, -0.2907],
[-1.0449, -0.9079, -0.3526, ..., -0.5538, 0.7915, 0.8056],
...,
[-0.5720, 0.9486, 0.4666, ..., -0.2160, 0.1702, 0.4758],
[-0.1059, 0.2155, 0.8135, ..., -0.8385, 0.4577, 0.0031],
[-0.0573, 0.7191, 0.0519, ..., -0.2227, -0.1204, -0.3053]]],
grad_fn=<ViewBackward0>)
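As a quick sanity check (a small sketch assuming the padded tensors above), both sides now contain the same number of tokens, so the sizes derived from them inside cross-attention line up:

print(input_tensor.shape, output_tensor.shape)   # both torch.Size([1, 10]) after padding
assert input_tensor.shape == output_tensor.shape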