问题

以下是原问题的中文翻译：

我创建了一个用于目标检测的自定义数据集，命名为ReceiptDataset，如下所示。

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

class ReceiptDataset(torch.utils.data.Dataset):
  """Object-detection dataset yielding a resized image and its box targets.

  Each item is a pair ``(image, target)``. ``image`` is a ``float32`` RGB
  array resized to ``(height, width)`` and divided by 255. ``target`` is a
  dict with the keys ``boxes`` (``[x_min, y_min, x_max, y_max]`` rescaled to
  the resized image), ``labels``, ``image_id``, ``area`` and ``iscrowd``.

  The number of boxes differs from image to image, so the default
  ``DataLoader`` collate function cannot stack the targets; use a
  ``collate_fn`` that keeps the per-image targets in a list.

  Args:
    train_dir: Directory whose entries are treated as image files.
    width: Width of the returned images, in pixels.
    height: Height of the returned images, in pixels.
    labels: Mapping from image file name to an iterable of
      ``(x, y, box_width, box_height, label)`` annotations given in the
      coordinates of the original image.
    transforms: Optional callable invoked as
      ``transforms(image=..., bboxes=..., labels=...)`` and returning a
      mapping with ``"image"`` and ``"bboxes"`` entries.

  NOTE(review): relies on a module-level ``classes`` sequence (looked up
  with ``classes.index``) that is defined outside this snippet.
  """

  def __init__(self, train_dir, width, height, labels, transforms=None):
    self.images = os.listdir(train_dir)
    self.width = width
    self.height = height
    self.train_dir = train_dir
    self.labels = labels
    self.transforms = transforms

  def __getitem__(self, idx):
    """Return ``(image, target)`` for the image at position ``idx``."""
    img_name = self.images[idx]
    img_path = os.path.join(self.train_dir, img_name)

    img = cv2.imread(img_path)
    if img is None:
      # cv2.imread reports an unreadable file by returning None instead of
      # raising; fail here with the offending path rather than in cvtColor.
      raise OSError(f"cv2.imread could not read image: {img_path}")

    ht, wt = img.shape[:2]

    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
    # The third positional parameter of cv2.resize is ``dst``; the
    # interpolation flag has to be passed by keyword.
    img_res = cv2.resize(
      img_rgb, (self.width, self.height), interpolation=cv2.INTER_AREA
    )
    img_res /= 255.0

    annot = self.labels[str(img_name)]

    lbls = []
    boxes = []
    for x, y, box_wt, box_ht, lbl in annot:
      # Convert (x, y, w, h) to corner format and rescale from the original
      # image size to the resized one.
      x_min_corr = (x / wt) * self.width
      x_max_corr = ((x + box_wt) / wt) * self.width
      y_min_corr = (y / ht) * self.height
      y_max_corr = ((y + box_ht) / ht) * self.height

      boxes.append([x_min_corr, y_min_corr, x_max_corr, y_max_corr])
      lbls.append(classes.index(str(lbl)))

    # reshape keeps the (N, 4) layout even when the image has no annotations,
    # so the column indexing below stays valid.
    boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
    lbls = torch.as_tensor(lbls, dtype=torch.int64)

    area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
    iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)

    target = {
      "boxes": boxes,
      "labels": lbls,
      "image_id": torch.as_tensor(idx),
      "area": area,
      "iscrowd": iscrowd,
    }

    if self.transforms:
      trans = self.transforms(image=img_res, bboxes=target["boxes"], labels=lbls)
      img_res = trans["image"]
      new_boxes = torch.Tensor(trans["bboxes"])
      if new_boxes.numel() == 0:
        # Keep the (0, 4) layout when the transform returns no boxes.
        new_boxes = new_boxes.reshape(0, 4)
      target["boxes"] = new_boxes
      # NOTE(review): "labels" and "area" are not refreshed from the
      # transform output; confirm the transform never drops boxes.

    return img_res, target

  def __len__(self):
    """Return the number of directory entries found in ``train_dir``."""
    return len(self.images)

我创建了一个名为train_dataset的实例：

# Build the training dataset from the image directory; no transforms are
# passed. width, height and plabels are defined outside this snippet.
train_dataset = ReceiptDataset("label-detector/images", width, height, plabels)

我的训练代码段如下：

from engine import train_one_epoch, evaluate

# For every epoch: run one training pass over train_loader, advance the
# learning-rate scheduler, then evaluate on test_loader.
# NOTE(review): model, optim, lr_scheduler, the loaders, device and
# num_epochs are defined outside this snippet.
for epoch in range(num_epochs):
  train_one_epoch(model, optim, train_loader, device, epoch, print_freq=2)

  lr_scheduler.step()

  evaluate(model, test_loader, device)

但每次运行训练循环时，都会出现运行时错误：

RuntimeError: stack expects each tensor to be equal size, but got [11,4] at entry 0 and [9,4] at entry 1（即：stack 期望每个张量大小相同，但条目 0 的大小为 [11,4]，条目 1 的大小为 [9,4]）

总共有17个类别，每个图像至少有4个注释。我注意到问题似乎来自数据集类中的标签列表/张量的大小，标签列表/张量的大小根据图像中注释项的数量而变化，但我无法弄清楚如何解决这个问题。

谢谢！

英文:

I created a custom dataset for object detection named ReceiptDataset as below.

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
class ReceiptDataset(torch.utils.data.Dataset):
  """Object-detection dataset yielding a resized image and its box targets.

  Each item is a pair ``(image, target)``. ``image`` is a ``float32`` RGB
  array resized to ``(height, width)`` and divided by 255. ``target`` is a
  dict with the keys ``boxes`` (``[x_min, y_min, x_max, y_max]`` rescaled to
  the resized image), ``labels``, ``image_id``, ``area`` and ``iscrowd``.

  The number of boxes differs from image to image, so the default
  ``DataLoader`` collate function cannot stack the targets; use a
  ``collate_fn`` that keeps the per-image targets in a list. Padding the
  labels to a fixed length inside the dataset is not needed.

  Args:
    train_dir: Directory whose entries are treated as image files.
    width: Width of the returned images, in pixels.
    height: Height of the returned images, in pixels.
    labels: Mapping from image file name to an iterable of
      ``(x, y, box_width, box_height, label)`` annotations given in the
      coordinates of the original image.
    transforms: Optional callable invoked as
      ``transforms(image=..., bboxes=..., labels=...)`` and returning a
      mapping with ``"image"`` and ``"bboxes"`` entries.

  NOTE(review): relies on a module-level ``classes`` sequence (looked up
  with ``classes.index``) that is defined outside this snippet.
  """

  def __init__(self, train_dir, width, height, labels, transforms=None):
    self.images = os.listdir(train_dir)
    self.width = width
    self.height = height
    self.train_dir = train_dir
    self.labels = labels
    self.transforms = transforms

  def __getitem__(self, idx):
    """Return ``(image, target)`` for the image at position ``idx``."""
    img_name = self.images[idx]
    img_path = os.path.join(self.train_dir, img_name)

    img = cv2.imread(img_path)
    if img is None:
      # cv2.imread reports an unreadable file by returning None instead of
      # raising; fail here with the offending path rather than in cvtColor.
      raise OSError(f"cv2.imread could not read image: {img_path}")

    ht, wt = img.shape[:2]

    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
    # The third positional parameter of cv2.resize is ``dst``; the
    # interpolation flag has to be passed by keyword.
    img_res = cv2.resize(
      img_rgb, (self.width, self.height), interpolation=cv2.INTER_AREA
    )
    img_res /= 255.0

    annot = self.labels[str(img_name)]

    lbls = []
    boxes = []
    for x, y, box_wt, box_ht, lbl in annot:
      # Convert (x, y, w, h) to corner format and rescale from the original
      # image size to the resized one.
      x_min_corr = (x / wt) * self.width
      x_max_corr = ((x + box_wt) / wt) * self.width
      y_min_corr = (y / ht) * self.height
      y_max_corr = ((y + box_ht) / ht) * self.height

      boxes.append([x_min_corr, y_min_corr, x_max_corr, y_max_corr])
      lbls.append(classes.index(str(lbl)))

    # reshape keeps the (N, 4) layout even when the image has no annotations,
    # so the column indexing below stays valid.
    boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
    lbls = torch.as_tensor(lbls, dtype=torch.int64)

    area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
    iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)

    target = {
      "boxes": boxes,
      "labels": lbls,
      "image_id": torch.as_tensor(idx),
      "area": area,
      "iscrowd": iscrowd,
    }

    if self.transforms:
      trans = self.transforms(image=img_res, bboxes=target["boxes"], labels=lbls)
      img_res = trans["image"]
      new_boxes = torch.Tensor(trans["bboxes"])
      if new_boxes.numel() == 0:
        # Keep the (0, 4) layout when the transform returns no boxes.
        new_boxes = new_boxes.reshape(0, 4)
      target["boxes"] = new_boxes
      # NOTE(review): "labels" and "area" are not refreshed from the
      # transform output; confirm the transform never drops boxes.

    return img_res, target

  def __len__(self):
    """Return the number of directory entries found in ``train_dir``."""
    return len(self.images)

and I created an instance with:

# Build the training dataset from the image directory; no transforms are
# passed. width, height and plabels are defined outside this snippet.
train_dataset = ReceiptDataset("label-detector/images", width, height, plabels)

and my training snippet is:

from engine import train_one_epoch, evaluate
# For every epoch: run one training pass over train_loader, advance the
# learning-rate scheduler, then evaluate on test_loader.
# NOTE(review): model, optim, lr_scheduler, the loaders, device and
# num_epochs are defined outside this snippet.
for epoch in range(num_epochs):
  train_one_epoch(model, optim, train_loader, device, epoch, print_freq=2)
  lr_scheduler.step()
  evaluate(model, test_loader, device)

but anytime I run the training loop, I’m getting a runtime error:

RuntimeError: stack expects each tensor to be equal size, but got [11,4] at entry 0 and [9,4] at entry 1

There are 17 classes in total and each image has a minimum of 4 annotations.
I noticed the problem seems to be coming from my labels list/tensor in the dataset class, the size of the labels list/tensor varies based on the number of annotated items in an image, but I can’t seem to figure out a way to fix this.

Thank you!

答案1

得分: 0

我通过为数据加载器（DataLoader）实现一个自定义的整理函数（collate function）解决了这个问题，该函数按照模型所需的格式返回一个批次的数据。

def collate_fn_seq(batch):
    """Collate detection samples into two parallel lists without stacking.

    Args:
        batch: Sequence of ``(image, target)`` pairs. ``image`` is a 3-D
            NumPy array whose last axis is moved to the front (presumably
            height x width x channels). ``target`` is a mapping that holds
            at least ``boxes``, ``labels``, ``image_id``, ``area`` and
            ``iscrowd``.

    Returns:
        ``(imgs, tars)``: a list with one permuted tensor per image and a
        list with one new dict per target containing exactly those five
        keys. Nothing is stacked, so per-image box counts may differ.
    """
    field_names = ("boxes", "labels", "image_id", "area", "iscrowd")

    imgs = [torch.from_numpy(sample[0]).permute(2, 0, 1) for sample in batch]
    tars = [
        {name: sample[1][name] for name in field_names}
        for sample in batch
    ]
    return imgs, tars

并在我的数据加载器中包含它：

# Pass collate_fn_seq so each batch is a pair of lists (images, targets)
# rather than stacked tensors.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_seq)

英文:

I solved it by implementing a custom collate function for the dataloader that returns a batch of my dataset as needed by my model.

def collate_fn_seq(batch):
    """Collate detection samples into two parallel lists without stacking.

    Args:
        batch: Sequence of ``(image, target)`` pairs. ``image`` is a 3-D
            NumPy array whose last axis is moved to the front (presumably
            height x width x channels). ``target`` is a mapping that holds
            at least ``boxes``, ``labels``, ``image_id``, ``area`` and
            ``iscrowd``.

    Returns:
        ``(imgs, tars)``: a list with one permuted tensor per image and a
        list with one new dict per target containing exactly those five
        keys. Nothing is stacked, so per-image box counts may differ.
    """
    field_names = ("boxes", "labels", "image_id", "area", "iscrowd")

    imgs = [torch.from_numpy(sample[0]).permute(2, 0, 1) for sample in batch]
    tars = [
        {name: sample[1][name] for name in field_names}
        for sample in batch
    ]
    return imgs, tars

and included it in my dataloaders using:

# Pass collate_fn_seq so each batch is a pair of lists (images, targets)
# rather than stacked tensors.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_seq)

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

Object Detection – 运行时错误：stack 期望每个张量具有相同的尺寸

问题

答案1

将迭代器中的值附加到对象中。

如何更改ResNet 18中的第一个卷积层？

Getting ModuleNotFoundError: No module named ‘torch.distributed._shard’

Error: ImportError: 无法从’torchvision.models.vgg’导入’model_urls’。

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论