如何在自定义数据生成器中获取正确的混淆矩阵数据?

huangapple go评论76阅读模式
英文:

How to get correct confusion_matrix data in customdatagenerator

问题

我正在构建混淆矩阵,但是我总是返回错误形状的y_true。

我认为我的y_label是正确的,我有62个验证数据。

我不知道y_true应该在哪里声明,以及如何获取y_true。

ValueError
Found input variables with inconsistent numbers of samples: [63, 62]
  File "C:\Labbb\inceptionResnetV2\InceptionResnetV2_1.py", line 213, in <module>
    sns.heatmap(confusion_matrix(y_true, y_pred),
ValueError: Found input variables with inconsistent numbers of samples: [63, 62]

我尝试在get_data中添加self.y_true,使用def get_y_true返回self.y_true,并在on_epoch_end中使用"self.y_true = []",shuffle=False。

这是CustomDataGenerator的代码。

我应该在哪里声明"self.y_true = []"?

train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'
image_folders = ['image0', 'image1', 'image2', 'image3', 'image4', 'image6', 'image7']  
label_folders = ['label0', 'label1', 'label2', 'label3', 'label4', 'label6', 'label7']  
class CustomDataGenerator(Sequence):
    def __init__(self, image_folders, label_folders, dir, dim=(512,512),  batch_size=1,n_classes=7,n_channels=8,shuffle=True):
        self.image_folders = image_folders
        ...
        self.image_paths = []
        self.label_paths = []
        self.on_epoch_end()
    def __len__(self):
        return int(np.ceil(len(self.image_paths) / self.batch_size))  
    def __getitem__(self, index):
        batch_image_paths = self.image_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch_label_paths = self.label_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch = zip(batch_image_paths, batch_label_paths)
        return self.get_data(batch)
    def on_epoch_end(self):
        self.image_paths = []
        self.label_paths = []
        
        self.y_true = []
        for folder in self.image_folders:
            image_folder_path = os.path.join(self.dir, folder)
            image_files = os.listdir(image_folder_path)
            for file_name in image_files:
                self.image_paths.append(os.path.join(image_folder_path, file_name))
        for folder in self.label_folders:
            ...
                
        if self.shuffle:
            np.random.shuffle(self.image_paths)
            np.random.shuffle(self.label_paths)
    def get_data(self, batch):
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size, self.n_classes))
        
        for i, (image_path, label_path) in enumerate(batch):
            image = np.load(image_path)
            with open(label_path, 'r') as f:
                line = f.readline().strip()
                filepath, label = line.rsplit(' ', 1)
                label = int(label)
                self.y_true.append(label)
            label_one_hot = to_categorical(label, num_classes=self.n_classes)

            X[i,] = image
            y[i,] = label_one_hot
            
        return X, y
    
    def get_y_true(self):
        return self.y_true

这是获取y_true和y_pred,并构建混淆矩阵的代码。

在这里,"y_true = val_datagen.get_y_true()"应该放在这行代码"Y_pred = model.predict"之前还是之后?

train_datagen = CustomDataGenerator(image_folders, label_folders, train_dir, **params, shuffle = True)
val_datagen = CustomDataGenerator(image_folders, label_folders, valid_dir, **params, shuffle = False)

y_true = val_datagen.get_y_true()
Y_pred = model.predict(val_datagen)
y_pred = np.argmax(Y_pred, axis=1) 
fig, ax = plt.subplots(figsize=(12,6))  
sns.heatmap(confusion_matrix(y_true, y_pred),annot=True, fmt="d", cmap='Greens',ax = ax)
英文:

I'm building confusion_matrix, but I always return wrong shape y_true

I think my y_label is correct, I have 62 val data

I dont know y_true should be declare at where and where to get y_true

ValueError
Found input variables with inconsistent numbers of samples: [63, 62]
File &quot;C:\Labbb\inceptionResnetV2\InceptionResnetV2_1.py&quot;, line 213, in &lt;module&gt;
sns.heatmap(confusion_matrix(y_true, y_pred),
ValueError: Found input variables with inconsistent numbers of samples: [63, 62]

I try to append self.y_true in get_data, use def get_y_true return self.y_true, and "self.y_true = []" in on_epoch_end, shuffle=False.

Here is CustomDataGenerator.

Where should I declare "self.y_true = []"

train_dir = r&#39;C:\Labbb\mergeimage_npy\512512\npy\train&#39;
valid_dir = r&#39;C:\Labbb\mergeimage_npy\512512\npy\val&#39;
image_folders = [&#39;image0&#39;, &#39;image1&#39;, &#39;image2&#39;, &#39;image3&#39;, &#39;image4&#39;, &#39;image6&#39;, &#39;image7&#39;]  
label_folders = [&#39;label0&#39;, &#39;label1&#39;, &#39;label2&#39;, &#39;label3&#39;, &#39;label4&#39;, &#39;label6&#39;, &#39;label7&#39;]  
class CustomDataGenerator(Sequence):
def __init__(self, image_folders, label_folders, dir, dim=(512,512),  batch_size=1,n_classes=7,n_channels=8,shuffle=True):
self.image_folders = image_folders
...
self.image_paths = []
self.label_paths = []
self.on_epoch_end()
def __len__(self):
return int(np.ceil(len(self.image_paths) / self.batch_size))  
def __getitem__(self, index):
batch_image_paths = self.image_paths[index * self.batch_size: (index + 1) * self.batch_size]
batch_label_paths = self.label_paths[index * self.batch_size: (index + 1) * self.batch_size]
batch = zip(batch_image_paths, batch_label_paths)
return self.get_data(batch)
def on_epoch_end(self):
self.image_paths = []
self.label_paths = []
self.y_true = []
for folder in self.image_folders:
image_folder_path = os.path.join(self.dir, folder)
image_files = os.listdir(image_folder_path)
for file_name in image_files:
self.image_paths.append(os.path.join(image_folder_path, file_name))
for folder in self.label_folders:
...
if self.shuffle:
np.random.shuffle(self.image_paths)
np.random.shuffle(self.label_paths)
def get_data(self, batch):
X = np.empty((self.batch_size, *self.dim, self.n_channels))
y = np.empty((self.batch_size, self.n_classes))
for i, (image_path, label_path) in enumerate(batch):
image = np.load(image_path)
with open(label_path, &#39;r&#39;) as f:
line = f.readline().strip()
filepath, label = line.rsplit(&#39; &#39;, 1)
label = int(label)
self.y_true.append(label)
label_one_hot = to_categorical(label, num_classes=self.n_classes)
X[i,] = image
y[i,] = label_one_hot
return X, y
def get_y_true(self):
return self.y_true

Here is get y_true and y_pred ,and build confusion_matrix

At here ,"y_true = val_datagen.get_y_true()" should be put before or after this line "Y_pred = model.predict"?

train_datagen = CustomDataGenerator(image_folders, label_folders, train_dir, **params, shuffle = True)
val_datagen = CustomDataGenerator(image_folders, label_folders, valid_dir, **params, shuffle = False)
y_true = val_datagen.get_y_true()
Y_pred = model.predict(val_datagen)
y_pred = np.argmax(Y_pred, axis=1) 
fig, ax = plt.subplots(figsize=(12,6))  
sns.heatmap(confusion_matrix(y_true, y_pred),annot=True, fmt=&quot;d&quot;, cmap=&#39;Greens&#39;,ax = ax)

答案1

得分: 0

我无法测试它,但现在应该可以工作了。我将self.y_true=[]on_epoch_end()移动到__getitem__()中,只有在调用第一批时才会重置。如果在每个epoch开始时使用回调函数会更好。这只有在数据集至少被调用一次后才起作用,因为在get_data()中,图像和标签是一个接一个地加载的。但是我也不确定批量大小为1时加载了多少图像。似乎一个批次得到一个图像文件夹路径,那个文件夹中是否有多个图像?

train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'
image_folders = ['image0', 'image1', 'image2', 'image3', 'image4', 'image6', 'image7']
label_folders = ['label0', 'label1', 'label2', 'label3', 'label4', 'label6', 'label7']

class CustomDataGenerator(Sequence):
    def __init__(self, image_folders, label_folders, dir, dim=(512,512),  batch_size=1,n_classes=7,n_channels=8,shuffle=True):
        self.image_folders = image_folders
        ...
        self.image_paths = []
        self.label_paths = []
        self.init_paths()
    
    def __len__(self):
        return int(np.ceil(len(self.image_paths) / self.batch_size))  
    
    def __getitem__(self, index):
        if index == 0:  # 这一行修复了问题
            self.y_true = []
        batch_image_paths = self.image_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch_label_paths = self.label_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch = zip(batch_image_paths, batch_label_paths)
        return self.get_data(batch)

    def init_paths(self):
        for folder in self.image_folders:
            image_folder_path = os.path.join(self.dir, folder)
            image_files = os.listdir(image_folder_path)
            for file_name in image_files:
                self.image_paths.append(os.path.join(image_folder_path, file_name))
        for folder in self.label_folders:
            ...
        if self.shuffle:
            np.random.shuffle(self.image_paths)
            np.random.shuffle(self.label_paths)
                
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.image_paths)
            np.random.shuffle(self.label_paths)

    def get_data(self, batch):
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size, self.n_classes))
        
        for i, (image_path, label_path) in enumerate(batch):
            image = np.load(image_path)
            with open(label_path, 'r') as f:
                line = f.readline().strip()
                filepath, label = line.rsplit(' ', 1)
                label = int(label)
                self.y_true.append(label)
            label_one_hot = to_categorical(label, num_classes=self.n_classes)

            X[i,] = image
            y[i,] = label_one_hot
            
        return X, y
    
    def get_y_true(self):
        return self.y_true
英文:

I can't test it, but it should work now. I moved the self.y_true=[] from on_epoch_end() to __get_item__(), where it gets only reset when the first batch is called. It would be even better with a callback on epoch starts. This will only work after the dataset has been called at least once, as the images and labels are loaded one batch after the other on get_data().
But I'm alos not sure how many images are loaded with a batch size of 1. It seems one batch gets an image folder path, are there multiple images in that folder?

train_dir = r&#39;C:\Labbb\mergeimage_npy\512512\npy\train&#39;
valid_dir = r&#39;C:\Labbb\mergeimage_npy\512512\npy\val&#39;
image_folders = [&#39;image0&#39;, &#39;image1&#39;, &#39;image2&#39;, &#39;image3&#39;, &#39;image4&#39;, &#39;image6&#39;, &#39;image7&#39;]  
label_folders = [&#39;label0&#39;, &#39;label1&#39;, &#39;label2&#39;, &#39;label3&#39;, &#39;label4&#39;, &#39;label6&#39;, &#39;label7&#39;]  
class CustomDataGenerator(Sequence):
def __init__(self, image_folders, label_folders, dir, dim=(512,512),  batch_size=1,n_classes=7,n_channels=8,shuffle=True):
self.image_folders = image_folders
...
self.image_paths = []
self.label_paths = []
self.init_paths()
def __len__(self):
return int(np.ceil(len(self.image_paths) / self.batch_size))  
def __getitem__(self, index):
if index == 0:  # this line here should fix it
self.y_true = []
batch_image_paths = self.image_paths[index * self.batch_size: (index + 1) * self.batch_size]
batch_label_paths = self.label_paths[index * self.batch_size: (index + 1) * self.batch_size]
batch = zip(batch_image_paths, batch_label_paths)
return self.get_data(batch)
def init_paths(self):
for folder in self.image_folders:
image_folder_path = os.path.join(self.dir, folder)
image_files = os.listdir(image_folder_path)
for file_name in image_files:
self.image_paths.append(os.path.join(image_folder_path, file_name))
for folder in self.label_folders:
...
if self.shuffle:
np.random.shuffle(self.image_paths)
np.random.shuffle(self.label_paths)
def on_epoch_end(self):
if self.shuffle:
np.random.shuffle(self.image_paths)
np.random.shuffle(self.label_paths)
def get_data(self, batch):
X = np.empty((self.batch_size, *self.dim, self.n_channels))
y = np.empty((self.batch_size, self.n_classes))
for i, (image_path, label_path) in enumerate(batch):
image = np.load(image_path)
with open(label_path, &#39;r&#39;) as f:
line = f.readline().strip()
filepath, label = line.rsplit(&#39; &#39;, 1)
label = int(label)
self.y_true.append(label)
label_one_hot = to_categorical(label, num_classes=self.n_classes)
X[i,] = image
y[i,] = label_one_hot
return X, y
def get_y_true(self):
return self.y_true

huangapple
  • 本文由 发表于 2023年8月8日 23:34:15
  • 转载请务必保留本文链接:https://go.coder-hub.com/76861082.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定