Transformers from scratch – shape '[1, 40, 64]' is invalid for input of size 2048 when passing input from encoder to decoder

Question

I'm trying to build a transformer model from scratch:

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BartTokenizer

class TransformerEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_heads, ff_size, dropout):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.pos_embedding = nn.Embedding(1000, hidden_size)  # Positional embedding
        self.dropout = nn.Dropout(dropout)
        
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            nn.Linear(ff_size, hidden_size)
        )
        self.layer_norm2 = nn.LayerNorm(hidden_size)
        
        self.num_layers = num_layers

    def forward(self, input_seq):
        seq_len, batch_size = input_seq.size()

        # Embedding and positional encoding
        embedded = self.embedding(input_seq)
        pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
        pos_embedded = self.pos_embedding(pos_ids)
        encoded = self.dropout(embedded + pos_embedded)

        # Transformer encoder layers
        for _ in range(self.num_layers):
            # Self-attention
            attention_output, _ = self.self_attention(encoded, encoded, encoded)
            attention_output = self.layer_norm1(encoded + self.dropout(attention_output))
            
            # Feed-forward
            ff_output = self.feed_forward(attention_output)
            ff_output = self.layer_norm2(attention_output + self.dropout(ff_output))

            encoded = ff_output

        return encoded


class TransformerDecoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers, num_heads, ff_size, dropout):
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.pos_embedding = nn.Embedding(1000, hidden_size)  # Positional embedding
        self.dropout = nn.Dropout(dropout)
        
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        
        self.encoder_attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.layer_norm2 = nn.LayerNorm(hidden_size)
        
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            nn.Linear(ff_size, hidden_size)
        )
        self.layer_norm3 = nn.LayerNorm(hidden_size)
        
        self.fc = nn.Linear(hidden_size, output_size)

        self.num_layers = num_layers

    def forward(self, input_seq, encoder_output):
        seq_len, batch_size = input_seq.size()

        # Embedding and positional encoding
        embedded = self.embedding(input_seq)
        pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
        pos_embedded = self.pos_embedding(pos_ids)
        encoded = self.dropout(embedded + pos_embedded)

        # Transformer decoder layers
        for _ in range(self.num_layers):
            # Self-attention
            self_attention_output, _ = self.self_attention(encoded, encoded, encoded)
            self_attention_output = self.layer_norm1(encoded + self.dropout(self_attention_output))
            
            # Encoder-decoder attention
            encoder_attention_output, _ = self.encoder_attention(self_attention_output, encoder_output, encoder_output)
            encoder_attention_output = self.layer_norm2(self_attention_output + self.dropout(encoder_attention_output))
            
            # Feed-forward
            ff_output = self.feed_forward(encoder_attention_output)
            ff_output = self.layer_norm3(encoder_attention_output + self.dropout(ff_output))

            encoded = ff_output

        output = self.fc(encoded)
        return output


class Transformer(nn.Module):
    def __init__(self, input_size, output_size, hidden_size, num_layers, num_heads, ff_size, dropout):
        super(Transformer, self).__init__()
        self.encoder = TransformerEncoder(input_size, hidden_size, num_layers, num_heads, ff_size, dropout)
        self.decoder = TransformerDecoder(output_size, hidden_size, num_layers, num_heads, ff_size, dropout)

    def forward(self, input_seq, target_seq):
        encoder_output = self.encoder(input_seq)
        output = self.decoder(target_seq, encoder_output)
        return output


# Example parameters
input_size = 50265  # Vocabulary size
output_size = 50265  # Vocabulary size
hidden_size = 256  # Hidden state size of the transformer layers
num_layers = 2  # Number of transformer layers
num_heads = 4  # Number of attention heads
ff_size = 1024  # Feed-forward layer size
dropout = 0.1  # Dropout rate

# Example training loop
input_text = "Hello, how are you?"
output_text = "I am doing well, thank you."

# Tokenization using Transformers
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
tokenizer.add_tokens(["<custom_token_1>", "<custom_token_2>"])

# Tokenize input and output text
input_tokens = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, padding=True)
output_tokens = tokenizer.encode(output_text, add_special_tokens=True, truncation=True, padding=True)

# Convert tokens to tensors
input_tensor = torch.tensor(input_tokens).unsqueeze(0)  # Add batch dimension
output_tensor = torch.tensor(output_tokens).unsqueeze(0)  # Add batch dimension

# Initialize the transformer model
model = Transformer(input_size, output_size, hidden_size, num_layers, num_heads, ff_size, dropout)

# Example training loop
num_epochs = 10
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    # Perform forward pass
    output = model(input_tensor, output_tensor)
    
    # Compute loss
    loss = F.cross_entropy(output.view(-1, output_size), output_tensor.view(-1))
    
    # Backpropagation and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Print the loss for monitoring
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Test the trained model with input text
test_input_text = "How's the weather today?"
test_input_tokens = tokenizer.encode(test_input_text, add_special_tokens=True, truncation=True, padding=True)
test_input_tensor = torch.tensor(test_input_tokens).unsqueeze(0)  # Add batch dimension

# Set the model to evaluation mode
model.eval()

# Perform forward pass with the test input
test_output = model(test_input_tensor, torch.zeros(1, 1).long())

# Get the predicted class
_, predicted_classes = torch.max(test_output, dim=2)
predicted_classes = predicted_classes.squeeze(0).tolist()

# Convert the predicted class tokens to text
predicted_text = tokenizer.decode(predicted_classes, skip_special_tokens=True)

# Print the predicted output text
print(f"Input: {test_input_text}")
print(f"Predicted Output: {predicted_text}")

The code imports the required libraries and defines a Transformer model with an Encoder and a Decoder. The Encoder and the Decoder consist of multiple transformer layers. The Transformer model is trained using the Adam optimizer and Cross-Entropy loss function. The trained model is then used to predict the output for a test input.

The code also uses the BartTokenizer from the transformers library to tokenize the input and output text.

Error

The following error is encountered while running the code:

RuntimeError: shape '[1, 40, 64]' is invalid for input of size 2048

Answer 1

Score: 1

Let's try to walk through your code, starting from the tokenizer:

from transformers import BartTokenizer

# Example training loop
input_text = "Hello, how are you?"
output_text = "I am doing well, thank you."

# Tokenization using Transformers
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
tokenizer.add_tokens(["<custom_token_1>", "<custom_token_2>"])

# Tokenize input and output text
input_tokens = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, padding=True)
output_tokens = tokenizer.encode(output_text, add_special_tokens=True, truncation=True, padding=True)

# Convert tokens to tensors
input_tensor = torch.tensor(input_tokens).unsqueeze(0)  # Add batch dimension
output_tensor = torch.tensor(output_tokens).unsqueeze(0)  # Add batch dimension

print(input_tensor.shape, output_tensor.shape)

[out]:

torch.Size([1, 8]) torch.Size([1, 10])

From here you can already sense that this might be problematic: the sizes are different for every data point, and different for the source and target too. You'll need to pad the inputs so that they are the same size when the model sees them.

Check the Annotated Transformer code (http://nlp.seas.harvard.edu/annotated-transformer/); it has a collate_batch function that handles all of these operations.
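For illustration, here is a minimal padding sketch in that spirit (my own simplified stand-in, not the Annotated Transformer's actual collate_batch). It pads a batch of token-id lists to a common length with BART's pad id of 1; in practice you would take the id from tokenizer.pad_token_id:

import torch

def pad_batch(token_id_lists, pad_id=1):
    # Pad every sequence to the length of the longest one in the batch.
    max_len = max(len(ids) for ids in token_id_lists)
    padded = [ids + [pad_id] * (max_len - len(ids)) for ids in token_id_lists]
    return torch.tensor(padded)  # shape: (batch_size, max_len)

# e.g. pad_batch([input_tokens, output_tokens]).shape -> torch.Size([2, 10]) for the two example sentences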


Next, try to put the input/output through one pass of the model:

import torch
import torch.nn as nn
import torch.nn.functional as F

# ... (same TransformerEncoder and TransformerDecoder classes and example
# parameters as defined in the question above)


encoder = TransformerEncoder(input_size, hidden_size, num_layers, num_heads, ff_size, dropout)
decoder = TransformerDecoder(output_size, hidden_size, num_layers, num_heads, ff_size, dropout)

encoded = encoder(input_tensor)
decoder(output_tensor, encoded)

And you will see this error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[23], line 1
----> 1 decoder(output_tensor, encoded)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

Cell In[20], line 88, in TransformerDecoder.forward(self, input_seq, encoder_output)
     85 self_attention_output = self.layer_norm1(encoded + self.dropout(self_attention_output))
     87 # Encoder-decoder attention
---> 88 encoder_attention_output, _ = self.encoder_attention(self_attention_output, encoder_output, encoder_output)
     89 encoder_attention_output = self.layer_norm2(self_attention_output + self.dropout(encoder_attention_output))
     91 # Feed-forward

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/activation.py:1189, in MultiheadAttention.forward(self, query, key, value, key_padding_mask, need_weights, attn_mask, average_attn_weights, is_causal)
   1175     attn_output, attn_output_weights = F.multi_head_attention_forward(
   1176         query, key, value, self.embed_dim, self.num_heads,
   1177         self.in_proj_weight, self.in_proj_bias,
   (...)
   1186         average_attn_weights=average_attn_weights,
   1187         is_causal=is_causal)
   1188 else:
-> 1189     attn_output, attn_output_weights = F.multi_head_attention_forward(
   1190         query, key, value, self.embed_dim, self.num_heads,
   1191         self.in_proj_weight, self.in_proj_bias,
   1192         self.bias_k, self.bias_v, self.add_zero_attn,
   1193         self.dropout, self.out_proj.weight, self.out_proj.bias,
   1194         training=self.training,
   1195         key_padding_mask=key_padding_mask,
   1196         need_weights=need_weights,
   1197         attn_mask=attn_mask,
   1198         average_attn_weights=average_attn_weights,
   1199         is_causal=is_causal)
   1200 if self.batch_first and is_batched:
   1201     return attn_output.transpose(1, 0), attn_output_weights

File /opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:5243, in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
   5241 q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
   5242 if static_k is None:
-> 5243     k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
   5244 else:
   5245     # TODO finish disentangling control flow so we don't do in-projections when statics are passed
   5246     assert static_k.size(0) == bsz * num_heads, \
   5247         f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"

RuntimeError: shape '[1, 40, 64]' is invalid for input of size 2048

Now that we have established that the error comes from multi_head_attention_forward, we need to ask: what does that mean?
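Just to decode those specific numbers first (a back-of-the-envelope check, assuming the example parameters above: hidden_size = 256 and num_heads = 4, so head_dim = 64):

hidden_size, num_heads = 256, 4
head_dim = hidden_size // num_heads     # 64
tgt_len, src_len = 10, 8                # torch.Size([1, 10]) target vs torch.Size([1, 8]) source from above

# multi_head_attention_forward reshapes the key with k.view(k.shape[0], bsz * num_heads, head_dim),
# and bsz is read from the query (the 10-token decoder side), which gives the shape [1, 40, 64]:
expected_elements = 1 * (tgt_len * num_heads) * head_dim   # 1 * 40 * 64 = 2560
actual_elements = 1 * src_len * hidden_size                # the 8-token encoder output only holds 2048
print(expected_elements, actual_elements)                  # 2560 2048

So the mismatch is simply the 10-token target being attended against the 8-token source, which is why padding both to the same length (below) makes the error go away.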

Q: When doing attention, do the input and output tensors need to be the same size for the dot products?

A: https://www.google.com/search?q=dot+product+need+to+be+the+same+siz

> The dot product of these two vectors is the sum of the products of elements at each position. In this case, the dot product is (1*2)+(2*4)+(3*6). Since we multiply elements at the same positions, the two vectors must have the same length in order to have a dot product.
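To make that concrete with a tiny PyTorch example (just an illustration of the same-length requirement):

import torch

a = torch.tensor([1., 2., 3.])
b = torch.tensor([2., 4., 6.])
print(torch.dot(a, b))                  # tensor(28.) == 1*2 + 2*4 + 3*6
# torch.dot(a, torch.tensor([2., 4.]))  # raises a RuntimeError because the lengths differ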

Now let's try to encode and decode the same input (input to input):

encoded = encoder(input_tensor)
decoder(input_tensor, encoded)

[out]:

tensor([[[-0.2148,  0.1838,  1.0957,  ...,  0.0339,  0.5636,  0.0724],
         [-0.3618,  0.2255, -0.0054,  ..., -0.8457,  0.7408, -0.2420],
         [-0.4362, -0.1221, -0.1487,  ..., -1.0708,  0.2463,  0.3195],
         ...,
         [-0.4172,  0.4385,  0.0669,  ..., -0.4320,  0.7453,  0.4359],
         [-0.7655,  0.2335, -0.0070,  ..., -0.2925, -0.0765, -0.0452],
         [ 0.1498,  0.4205,  0.2231,  ..., -0.6602, -0.4533,  0.1439]]],
       grad_fn=<ViewBackward0>)

It works!

But I want it to translate input to output, not copy the input!

A: Try padding your input and output tensors

input_ids, output_ids = tokenizer(
    [input_text, output_text], 
    add_special_tokens=True, truncation=True, padding=True, 
    pad_to_max_length=True
)['input_ids']

print(input_ids, output_ids)

[out]:

([0, 31414, 6, 141, 32, 47, 116, 2, 1, 1],
 [0, 100, 524, 608, 157, 6, 3392, 47, 4, 2])

Now the model should pass through properly with the inputs padded with id 1.
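For reference, that pad id of 1 is exactly what the tokenizer reports for BART's <pad> token; a quick check:

print(tokenizer.pad_token_id)  # 1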


input_ids, output_ids = tokenizer(
    [input_text, output_text], 
    add_special_tokens=True, truncation=True, padding=True, 
    pad_to_max_length=True
)['input_ids']

# Convert tokens to tensors
input_tensor = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
output_tensor = torch.tensor(output_ids).unsqueeze(0)  # Add batch dimension

encoded = encoder(input_tensor)
decoder(output_tensor, encoded)

[out]:

tensor([[[-0.2411,  0.0112,  0.7898,  ...,  0.0268,  0.5292,  0.3200],
         [-0.4188,  0.2833, -0.0789,  ..., -0.4128,  0.5024, -0.2907],
         [-1.0449, -0.9079, -0.3526,  ..., -0.5538,  0.7915,  0.8056],
         ...,
         [-0.5720,  0.9486,  0.4666,  ..., -0.2160,  0.1702,  0.4758],
         [-0.1059,  0.2155,  0.8135,  ..., -0.8385,  0.4577,  0.0031],
         [-0.0573,  0.7191,  0.0519,  ..., -0.2227, -0.1204, -0.3053]]],
       grad_fn=<ViewBackward0>)
