Why is PyTorch significantly slower than Tensorflow? Both are running CUDA
Question
I was testing the speeds of TensorFlow 2.0 and PyTorch 2.0 (since I'm just starting to learn PyTorch). With the exact same model architecture, the same batch size, the same optimizer, and both running on CUDA, the PyTorch version takes about 3 times as long to run (1 minute for TF vs. 3 minutes for PyTorch), and it also reaches a lower accuracy (83% on the validation set for TF vs. 78% for PyTorch).
Here is the code for TensorFlow:
import tensorflow as tf

(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(512, activation="relu"),
    tf.keras.layers.Dense(512, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax")
])
model.compile(optimizer=tf.keras.optimizers.SGD(1e-3), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(train_images, train_labels, epochs=20, batch_size=64, validation_data=(test_images, test_labels))
And here is the code for PyTorch:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

device = torch.device("cuda")
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)
train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
model = NeuralNetwork()
model.to(device)
learning_rate = 1e-3
batch_size = 64
epochs = 5
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test_loop(dataloader, model, loss_fn):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
epochs = 20
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")
What I've noticed is that TensorFlow uses about 60% of my CUDA utilization and all of my dedicated GPU memory, while PyTorch bounces between 0% and 30% and hardly uses any GPU memory at all.
This post exists, but that slowdown was caused by CUDA_LAUNCH_BLOCKING, which is not set anywhere in my code.
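To double-check, I confirmed the variable isn't set in my environment (a quick sanity check using os.environ):

import os

# Prints None when CUDA_LAUNCH_BLOCKING is not set for this process
print(os.environ.get("CUDA_LAUNCH_BLOCKING"))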
Specs:
- RTX 3060
- Intel i7-10700, overclocked to ~4.2GHz
- 64GB memory
Edit: I've tried increasing num_workers and enabling pin_memory on the DataLoaders, but that only saved around 15 seconds over 20 epochs, and torch.backends.cudnn.benchmark = True doesn't help at all.
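For reference, here is roughly what I tried (a sketch; 4 workers is simply the value I tested, not a tuned setting):

torch.backends.cudnn.benchmark = True  # let cuDNN auto-tune kernels for fixed input shapes

# Workers load batches in parallel; pinned memory speeds up host-to-GPU copies
train_dataloader = DataLoader(training_data, batch_size=64, num_workers=4, pin_memory=True)
test_dataloader = DataLoader(test_data, batch_size=64, num_workers=4, pin_memory=True)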
Answer 1
Score: 0
I have converted your code from TensorFlow to PyTorch and it runs faster, so check this version of the code and try it on your device.
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

# Define the model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28*28, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 10)

    def forward(self, x):
        x = self.flatten(x)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Load the data
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)
train_loader = DataLoader(training_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64)

# Create the model and move it to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralNetwork().to(device)

# Define loss and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)

# Training loop
for epoch in range(10):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        pred = model(images)
        loss = loss_fn(pred, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch}: Loss: {loss.item()}")

# Validate on the test data
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        pred = model(images)
        _, predicted = torch.max(pred, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy: {accuracy * 100}%')
Answer 2
Score: 0
While I was running a test to optimize num_workers as outlined in this medium article, I figured out that it was my DataLoader that was slowing me down. After looking through the documentation, I came across persistent_workers=True, which solved the problem for me.
The code went from ~3 minutes to about 25 seconds for 20 epochs.
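Here is roughly what the fix looks like (a sketch, assuming the DataLoader setup from the question; persistent_workers=True is only valid when num_workers > 0):

from torch.utils.data import DataLoader

# Keep worker processes alive across epochs instead of re-spawning them each epoch,
# which avoids paying the worker startup cost 20 times for a model this small
train_dataloader = DataLoader(
    training_data,
    batch_size=64,
    num_workers=2,            # machine-dependent; worth tuning
    pin_memory=True,
    persistent_workers=True,  # requires num_workers > 0
)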