Transformers from scratch – shape '[1, 40, 64]' is invalid for input of size 2048 when passing input from encoder to decoder

huangapple go评论90阅读模式

Transformers from scratch - shape '[1, 40, 64]' is invalid for input of size 2048 when passing input from encoder to decoder




关于报错信息"shape ' [1, 40, 64] ' is invalid for input of size 2048",这通常是由于模型的输入维度与数据的维度不匹配导致的。你可能需要检查以下几点:

  1. 确保输入数据的形状与模型期望的形状匹配。在这个报错中,注意力层试图把一个张量重排为形状[1, 40, 64](共 1×40×64 = 2560 个元素),但该张量实际只有2048个元素,因此无法重排。你需要确保编码器输出与解码器输入的形状(尤其是批次维度)一致。

  2. 检查模型架构,特别是编码器和解码器的层。确保它们的维度设置正确,与输入数据的维度兼容。

  3. 检查数据预处理步骤,包括文本标记化和张量转换。确保它们按照预期进行,并且没有导致维度错误。

  4. 如果仍然遇到问题,可以尝试逐步调试代码,查找在哪个步骤导致了维度不匹配的问题。



I'm trying to build a transformer model from scratch:

  1. import torch
  2. import torch.nn as nn
  3. import torch.nn.functional as F
  4. class TransformerEncoder(nn.Module):
  5. def __init__(self, input_size, hidden_size, num_layers, num_heads, ff_size, dropout):
  6. super(TransformerEncoder, self).__init__()
  7. self.embedding = nn.Embedding(input_size, hidden_size)
  8. self.pos_embedding = nn.Embedding(1000, hidden_size) # Positional embedding
  9. self.dropout = nn.Dropout(dropout)
  10. self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
  11. self.layer_norm1 = nn.LayerNorm(hidden_size)
  12. self.feed_forward = nn.Sequential(
  13. nn.Linear(hidden_size, ff_size),
  14. nn.ReLU(),
  15. nn.Linear(ff_size, hidden_size)
  16. )
  17. self.layer_norm2 = nn.LayerNorm(hidden_size)
  18. self.num_layers = num_layers
  19. def forward(self, input_seq):
  20. seq_len, batch_size = input_seq.size()
  21. # Embedding and positional encoding
  22. embedded = self.embedding(input_seq)
  23. pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
  24. pos_embedded = self.pos_embedding(pos_ids)
  25. encoded = self.dropout(embedded + pos_embedded)
  26. # Transformer encoder layers
  27. for _ in range(self.num_layers):
  28. # Self-attention
  29. attention_output, _ = self.self_attention(encoded, encoded, encoded)
  30. attention_output = self.layer_norm1(encoded + self.dropout(attention_output))
  31. # Feed-forward
  32. ff_output = self.feed_forward(attention_output)
  33. ff_output = self.layer_norm2(attention_output + self.dropout(ff_output))
  34. encoded = ff_output
  35. return encoded
  36. class TransformerDecoder(nn.Module):
  37. def __init__(self, output_size, hidden_size, num_layers, num_heads, ff_size, dropout):
  38. super(TransformerDecoder, self).__init__()
  39. self.embedding = nn.Embedding(output_size, hidden_size)
  40. self.pos_embedding = nn.Embedding(1000, hidden_size) # Positional embedding
  41. self.dropout = nn.Dropout(dropout)
  42. self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
  43. self.layer_norm1 = nn.LayerNorm(hidden_size)
  44. self.encoder_attention = nn.MultiheadAttention(hidden_size, num_heads)
  45. self.layer_norm2 = nn.LayerNorm(hidden_size)
  46. self.feed_forward = nn.Sequential(
  47. nn.Linear(hidden_size, ff_size),
  48. nn.ReLU(),
  49. nn.Linear(ff_size, hidden_size)
  50. )
  51. self.layer_norm3 = nn.LayerNorm(hidden_size)
  52. self.fc = nn.Linear(hidden_size, output_size)
  53. self.num_layers = num_layers
  54. def forward(self, input_seq, encoder_output):
  55. seq_len, batch_size = input_seq.size()
  56. # Embedding and positional encoding
  57. embedded = self.embedding(input_seq)
  58. pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
  59. pos_embedded = self.pos_embedding(pos_ids)
  60. encoded = self.dropout(embedded + pos_embedded)
  61. # Transformer decoder layers
  62. for _ in range(self.num_layers):
  63. # Self-attention
  64. self_attention_output, _ = self.self_attention(encoded, encoded, encoded)
  65. self_attention_output = self.layer_norm1(encoded + self.dropout(self_attention_output))
  66. # Encoder-decoder attention
  67. encoder_attention_output, _ = self.encoder_attention(self_attention_output, encoder_output, encoder_output)
  68. encoder_attention_output = self.layer_norm2(self_attention_output + self.dropout(encoder_attention_output))
  69. # Feed-forward
  70. ff_output = self.feed_forward(encoder_attention_output)
  71. ff_output = self.layer_norm3(encoder_attention_output + self.dropout(ff_output))
  72. encoded = ff_output
  73. output = self.fc(encoded)
  74. return output
  75. class Transformer(nn.Module):
  76. def __init__(self, input_size, output_size, hidden_size, num_layers, num_heads, ff_size, dropout):
  77. super(Transformer, self).__init__()
  78. self.encoder = TransformerEncoder(input_size, hidden_size, num_layers, num_heads, ff_size, dropout)
  79. self.decoder = TransformerDecoder(output_size, hidden_size, num_layers, num_heads, ff_size, dropout)
  80. def forward(self, input_seq, target_seq):
  81. encoder_output = self.encoder(input_seq)
  82. output = self.decoder(target_seq, encoder_output)
  83. return output
  84. # Example parameters
  85. input_size = 50265 # Vocabulary size
  86. output_size = 50265 # Vocabulary size
  87. hidden_size = 256 # Hidden state size of the transformer layers
  88. num_layers = 2 # Number of transformer layers
  89. num_heads = 4 # Number of attention heads
  90. ff_size = 1024 # Feed-forward layer size
  91. dropout = 0.1 # Dropout rate
  92. # Example training loop
  93. input_text = "Hello, how are you?"
  94. output_text = "I am doing well, thank you."
  95. # Tokenization using Transformers
  96. tokenizer = BartTokenizer.from_pretrained('bart-large-cnn')
  97. tokenizer.add_tokens(["<custom_token_1>", "<custom_token_2>"])
  98. # Tokenize input and output text
  99. input_tokens = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, padding=True)
  100. output_tokens = tokenizer.encode(output_text, add_special_tokens=True, truncation=True, padding=True)
  101. # Convert tokens to tensors
  102. input_tensor = torch.tensor(input_tokens).unsqueeze(0) # Add batch dimension
  103. output_tensor = torch.tensor(output_tokens).unsqueeze(0) # Add batch dimension
  104. # Initialize the transformer model
  105. model = Transformer(input_size, output_size, hidden_size, num_layers, num_heads, ff_size, dropout)
  106. # Example training loop
  107. num_epochs = 10
  108. learning_rate = 0.001
  109. optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  110. for epoch in range(num_epochs):
  111. # Perform forward pass
  112. output = model(input_tensor, output_tensor)
  113. # Compute loss
  114. loss = F.cross_entropy(output.view(-1, output_size), output_tensor.view(-1))
  115. # Backpropagation and optimization
  116. optimizer.zero_grad()
  117. loss.backward()
  118. optimizer.step()
  119. # Print the loss for monitoring
  120. print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
  121. # Test the trained model with input text
  122. test_input_text = "How's the weather today?"
  123. test_input_tokens = tokenizer.encode(test_input_text, add_special_tokens=True, truncation=True, padding=True)
  124. test_input_tensor = torch.tensor(test_input_tokens).unsqueeze(0) # Add batch dimension
  125. # Set the model to evaluation mode
  126. model.eval()
  127. # Perform forward pass with the test input
  128. test_output = model(test_input_tensor, torch.zeros(1, 1).long())
  129. # Get the predicted class
  130. _, predicted_classes = torch.max(test_output, dim=2)
  131. predicted_classes = predicted_classes.squeeze(0).tolist()
  132. # Convert the predicted class tokens to text
  133. predicted_text = tokenizer.decode(predicted_classes, skip_special_tokens=True)
  134. # Print the predicted output text
  135. print(f"Input: {test_input_text}")
  136. print(f"Predicted Output: {predicted_text}")

The code imports the required libraries and defines a Transformer model with an Encoder and a Decoder. The Encoder and the Decoder consist of multiple transformer layers. The Transformer model is trained using the Adam optimizer and Cross-Entropy loss function. The trained model is then used to predict the output for a test input.

The code also uses the BartTokenizer from the transformers library to tokenize the input and output text.


The following error is encountered while running the code:

  1. RuntimeError: shape '[1, 40, 64]' is invalid for input of size 2048


得分: 1

Here's the translated code without any additional information:

  1. from transformers import BartTokenizer
  2. # Example training loop
  3. input_text = "Hello, how are you?"
  4. output_text = "I am doing well, thank you."
  5. # Tokenization using Transformers
  6. tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
  7. tokenizer.add_tokens(["<custom_token_1>", "<custom_token_2>"])
  8. # Tokenize input and output text
  9. input_tokens = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, padding=True)
  10. output_tokens = tokenizer.encode(output_text, add_special_tokens=True, truncation=True, padding=True)
  11. # Convert tokens to tensors
  12. input_tensor = torch.tensor(input_tokens).unsqueeze(0) # Add batch dimension
  13. output_tensor = torch.tensor(output_tokens).unsqueeze(0) # Add batch dimension
  14. print(input_tensor.shape, output_tensor.shape)

Here's the error part:

  1. import torch
  2. import torch.nn as nn
  3. import torch.nn.functional as F
  4. # ... (Rest of the code)
  5. encoded = encoder(input_tensor)
  6. decoder(output_tensor, encoded)

And the part where the error is explained and fixed:

  1. input_ids, output_ids = tokenizer(
  2. [input_text, output_text],
  3. add_special_tokens=True, truncation=True, padding=True,
  4. pad_to_max_length=True
  5. )['input_ids']
  6. # Convert tokens to tensors
  7. input_tensor = torch.tensor(input_ids).unsqueeze(0) # Add batch dimension
  8. output_tensor = torch.tensor(output_ids).unsqueeze(0) # Add batch dimension
  9. encoded = encoder(input_tensor)
  10. decoder(output_tensor, encoded)

Let me know if you need anything else.


Let's walk through your code, starting from the tokenizer:

  1. from transformers import BartTokenizer
  2. # Example training loop
  3. input_text = "Hello, how are you?"
  4. output_text = "I am doing well, thank you."
  5. # Tokenization using Transformers
  6. tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
  7. tokenizer.add_tokens(["<custom_token_1>", "<custom_token_2>"])
  8. # Tokenize input and output text
  9. input_tokens = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, padding=True)
  10. output_tokens = tokenizer.encode(output_text, add_special_tokens=True, truncation=True, padding=True)
  11. # Convert tokens to tensors
  12. input_tensor = torch.tensor(input_tokens).unsqueeze(0) # Add batch dimension
  13. output_tensor = torch.tensor(output_tokens).unsqueeze(0) # Add batch dimension
  14. print(input_tensor.shape, output_tensor.shape)


  1. torch.Size([1, 8]) torch.Size([1, 10])

From here you can already sense what might be problematic: the sequence lengths differ for every data point, and they differ between source and target too. You'll need to pad the inputs so that they are the same size when the model sees them.

Check the Annotated Transformer code; it has a collate_batch function that handles all these operations.

Next, try to run the input and output through the model in one pass:

  1. import torch
  2. import torch.nn as nn
  3. import torch.nn.functional as F
  4. class TransformerEncoder(nn.Module):
  5. def __init__(self, input_size, hidden_size, num_layers, num_heads, ff_size, dropout):
  6. super(TransformerEncoder, self).__init__()
  7. self.embedding = nn.Embedding(input_size, hidden_size)
  8. self.pos_embedding = nn.Embedding(1000, hidden_size) # Positional embedding
  9. self.dropout = nn.Dropout(dropout)
  10. self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
  11. self.layer_norm1 = nn.LayerNorm(hidden_size)
  12. self.feed_forward = nn.Sequential(
  13. nn.Linear(hidden_size, ff_size),
  14. nn.ReLU(),
  15. nn.Linear(ff_size, hidden_size)
  16. )
  17. self.layer_norm2 = nn.LayerNorm(hidden_size)
  18. self.num_layers = num_layers
  19. def forward(self, input_seq):
  20. seq_len, batch_size = input_seq.size()
  21. # Embedding and positional encoding
  22. embedded = self.embedding(input_seq)
  23. pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
  24. pos_embedded = self.pos_embedding(pos_ids)
  25. encoded = self.dropout(embedded + pos_embedded)
  26. # Transformer encoder layers
  27. for _ in range(self.num_layers):
  28. # Self-attention
  29. attention_output, _ = self.self_attention(encoded, encoded, encoded)
  30. attention_output = self.layer_norm1(encoded + self.dropout(attention_output))
  31. # Feed-forward
  32. ff_output = self.feed_forward(attention_output)
  33. ff_output = self.layer_norm2(attention_output + self.dropout(ff_output))
  34. encoded = ff_output
  35. return encoded
  36. class TransformerDecoder(nn.Module):
  37. def __init__(self, output_size, hidden_size, num_layers, num_heads, ff_size, dropout):
  38. super(TransformerDecoder, self).__init__()
  39. self.embedding = nn.Embedding(output_size, hidden_size)
  40. self.pos_embedding = nn.Embedding(1000, hidden_size) # Positional embedding
  41. self.dropout = nn.Dropout(dropout)
  42. self.self_attention = nn.MultiheadAttention(hidden_size, num_heads)
  43. self.layer_norm1 = nn.LayerNorm(hidden_size)
  44. self.encoder_attention = nn.MultiheadAttention(hidden_size, num_heads)
  45. self.layer_norm2 = nn.LayerNorm(hidden_size)
  46. self.feed_forward = nn.Sequential(
  47. nn.Linear(hidden_size, ff_size),
  48. nn.ReLU(),
  49. nn.Linear(ff_size, hidden_size)
  50. )
  51. self.layer_norm3 = nn.LayerNorm(hidden_size)
  52. self.fc = nn.Linear(hidden_size, output_size)
  53. self.num_layers = num_layers
  54. def forward(self, input_seq, encoder_output):
  55. seq_len, batch_size = input_seq.size()
  56. # Embedding and positional encoding
  57. embedded = self.embedding(input_seq)
  58. pos_ids = torch.arange(seq_len, device=input_seq.device).unsqueeze(0).expand_as(input_seq)
  59. pos_embedded = self.pos_embedding(pos_ids)
  60. encoded = self.dropout(embedded + pos_embedded)
  61. # Transformer decoder layers
  62. for _ in range(self.num_layers):
  63. # Self-attention
  64. self_attention_output, _ = self.self_attention(encoded, encoded, encoded)
  65. self_attention_output = self.layer_norm1(encoded + self.dropout(self_attention_output))
  66. # Encoder-decoder attention
  67. encoder_attention_output, _ = self.encoder_attention(self_attention_output, encoder_output, encoder_output)
  68. encoder_attention_output = self.layer_norm2(self_attention_output + self.dropout(encoder_attention_output))
  69. # Feed-forward
  70. ff_output = self.feed_forward(encoder_attention_output)
  71. ff_output = self.layer_norm3(encoder_attention_output + self.dropout(ff_output))
  72. encoded = ff_output
  73. output = self.fc(encoded)
  74. return output
  75. # Example parameters
  76. input_size = 50265 # Vocabulary size
  77. output_size = 50265 # Vocabulary size
  78. hidden_size = 256 # Hidden state size of the transformer layers
  79. num_layers = 2 # Number of transformer layers
  80. num_heads = 4 # Number of attention heads
  81. ff_size = 1024 # Feed-forward layer size
  82. dropout = 0.1 # Dropout rate
  83. encoder = TransformerEncoder(input_size, hidden_size, num_layers, num_heads, ff_size, dropout)
  84. decoder = TransformerDecoder(output_size, hidden_size, num_layers, num_heads, ff_size, dropout)
  85. encoded = encoder(input_tensor)
  86. decoder(output_tensor, encoded)

And you will see this error:

  1. ---------------------------------------------------------------------------
  2. RuntimeError Traceback (most recent call last)
  3. Cell In[23], line 1
  4. ----> 1 decoder(output_tensor, encoded)
  5. File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
  6. 1496 # If we don't have any hooks, we want to skip the rest of the logic in
  7. 1497 # this function, and just call forward.
  8. 1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
  9. 1499 or _global_backward_pre_hooks or _global_backward_hooks
  10. 1500 or _global_forward_hooks or _global_forward_pre_hooks):
  11. -> 1501 return forward_call(*args, **kwargs)
  12. 1502 # Do not call functions when jit is used
  13. 1503 full_backward_hooks, non_full_backward_hooks = [], []
  14. Cell In[20], line 88, in TransformerDecoder.forward(self, input_seq, encoder_output)
  15. 85 self_attention_output = self.layer_norm1(encoded + self.dropout(self_attention_output))
  16. 87 # Encoder-decoder attention
  17. ---> 88 encoder_attention_output, _ = self.encoder_attention(self_attention_output, encoder_output, encoder_output)
  18. 89 encoder_attention_output = self.layer_norm2(self_attention_output + self.dropout(encoder_attention_output))
  19. 91 # Feed-forward
  20. File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
  21. 1496 # If we don't have any hooks, we want to skip the rest of the logic in
  22. 1497 # this function, and just call forward.
  23. 1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
  24. 1499 or _global_backward_pre_hooks or _global_backward_hooks
  25. 1500 or _global_forward_hooks or _global_forward_pre_hooks):
  26. -> 1501 return forward_call(*args, **kwargs)
  27. 1502 # Do not call functions when jit is used
  28. 1503 full_backward_hooks, non_full_backward_hooks = [], []
  29. File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/activation.py:1189, in MultiheadAttention.forward(self, query, key, value, key_padding_mask, need_weights, attn_mask, average_attn_weights, is_causal)
  30. 1175 attn_output, attn_output_weights = F.multi_head_attention_forward(
  31. 1176 query, key, value, self.embed_dim, self.num_heads,
  32. 1177 self.in_proj_weight, self.in_proj_bias,
  33. (...)
  34. 1186 average_attn_weights=average_attn_weights,
  35. 1187 is_causal=is_causal)
  36. 1188 else:
  37. -> 1189 attn_output, attn_output_weights = F.multi_head_attention_forward(
  38. 1190 query, key, value, self.embed_dim, self.num_heads,
  39. 1191 self.in_proj_weight, self.in_proj_bias,
  40. 1192 self.bias_k, self.bias_v, self.add_zero_attn,
  41. 1193 self.dropout, self.out_proj.weight, self.out_proj.bias,
  42. 1194 training=self.training,
  43. 1195 key_padding_mask=key_padding_mask,
  44. 1196 need_weights=need_weights,
  45. 1197 attn_mask=attn_mask,
  46. 1198 average_attn_weights=average_attn_weights,
  47. 1199 is_causal=is_causal)
  48. 1200 if self.batch_first and is_batched:
  49. 1201 return attn_output.transpose(1, 0), attn_output_weights
  50. File /opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:5243, in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
  51. 5241 q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
  52. 5242 if static_k is None:
  53. -> 5243 k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
  54. 5244 else:
  55. 5245 # TODO finish disentangling control flow so we don't do in-projections when statics are passed
  56. 5246 assert static_k.size(0) == bsz * num_heads, \
  57. 5247 f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
  58. RuntimeError: shape '[1, 40, 64]' is invalid for input of size 2048

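Now that we have established that the error comes from multi_head_attention_forward, we need to ask: what does that mean? The failing line reshapes the key with k.view(k.shape[0], bsz * num_heads, head_dim). nn.MultiheadAttention defaults to batch_first=False, so it reads its inputs as (sequence, batch, embedding): the [1, 10] target is treated as a batch of 10 (bsz * num_heads = 10 * 4 = 40, head_dim = 256 / 4 = 64), while the encoder output built from the [1, 8] input holds only 1 * 8 * 256 = 2048 values, which cannot be viewed as [1, 40, 64].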

Q: When doing attention, do the input and output tensors need to be the same size for the dot products?


> The dot product of these two vectors is the sum of the products of elements at each position. In this case, the dot product is (1*2)+(2*4)+(3*6). Since we multiply elements at the same positions, the two vectors must have the same length in order to have a dot product.

Now let's try to encode the input and then decode that same input against the encoding:

  1. encoded = encoder(input_tensor)
  2. decoder(input_tensor, encoded)


  1. tensor([[[-0.2148, 0.1838, 1.0957, ..., 0.0339, 0.5636, 0.0724],
  2. [-0.3618, 0.2255, -0.0054, ..., -0.8457, 0.7408, -0.2420],
  3. [-0.4362, -0.1221, -0.1487, ..., -1.0708, 0.2463, 0.3195],
  4. ...,
  5. [-0.4172, 0.4385, 0.0669, ..., -0.4320, 0.7453, 0.4359],
  6. [-0.7655, 0.2335, -0.0070, ..., -0.2925, -0.0765, -0.0452],
  7. [ 0.1498, 0.4205, 0.2231, ..., -0.6602, -0.4533, 0.1439]]],
  8. grad_fn=<ViewBackward0>)

It works!

But I want it to translate input to output, not copy the input!

A: Try padding your input and output tensors

  1. input_ids, output_ids = tokenizer(
  2. [input_text, output_text],
  3. add_special_tokens=True, truncation=True, padding=True,
  4. pad_to_max_length=True
  5. )['input_ids']
  6. print(input_ids, output_ids)


  1. ([0, 31414, 6, 141, 32, 47, 116, 2, 1, 1],
  2. [0, 100, 524, 608, 157, 6, 3392, 47, 4, 2])

Now the model should run without error, since both sequences have the same length, with the shorter input padded with id 1.

  1. input_ids, output_ids = tokenizer(
  2. [input_text, output_text],
  3. add_special_tokens=True, truncation=True, padding=True,
  4. pad_to_max_length=True
  5. )['input_ids']
  6. # Convert tokens to tensors
  7. input_tensor = torch.tensor(input_ids).unsqueeze(0) # Add batch dimension
  8. output_tensor = torch.tensor(output_ids).unsqueeze(0) # Add batch dimension
  9. encoded = encoder(torch.tensor(input_tensor))
  10. decoder(torch.tensor(output_tensor), encoded)


  1. tensor([[[-0.2411, 0.0112, 0.7898, ..., 0.0268, 0.5292, 0.3200],
  2. [-0.4188, 0.2833, -0.0789, ..., -0.4128, 0.5024, -0.2907],
  3. [-1.0449, -0.9079, -0.3526, ..., -0.5538, 0.7915, 0.8056],
  4. ...,
  5. [-0.5720, 0.9486, 0.4666, ..., -0.2160, 0.1702, 0.4758],
  6. [-0.1059, 0.2155, 0.8135, ..., -0.8385, 0.4577, 0.0031],
  7. [-0.0573, 0.7191, 0.0519, ..., -0.2227, -0.1204, -0.3053]]],
  8. grad_fn=<ViewBackward0>)

  • 本文由 发表于 2023年5月22日 06:54:28
  • 转载请务必保留本文链接:



:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:
