英文:
How to get correct confusion_matrix data in customdatagenerator
问题
我正在构建混淆矩阵,但是我总是返回错误形状的y_true。
我认为我的y_label是正确的,我有62个验证数据。
我不知道y_true应该在哪里声明,以及如何获取y_true。
ValueError
Found input variables with inconsistent numbers of samples: [63, 62]
File "C:\Labbb\inceptionResnetV2\InceptionResnetV2_1.py", line 213, in <module>
sns.heatmap(confusion_matrix(y_true, y_pred),
ValueError: Found input variables with inconsistent numbers of samples: [63, 62]
我尝试在get_data中向self.y_true追加标签,用def get_y_true返回self.y_true,在on_epoch_end中设置"self.y_true = []",并使用shuffle=False。
这是CustomDataGenerator的代码。
我应该在哪里声明"self.y_true = []"?
# Root folders of the training and validation data.
train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'
# Per-class sub-folder names; note that index 5 is absent from both lists.
image_folders = ['image%d' % k for k in (0, 1, 2, 3, 4, 6, 7)]
label_folders = ['label%d' % k for k in (0, 1, 2, 3, 4, 6, 7)]
class CustomDataGenerator(Sequence):
    """Batch generator serving ``np.load``-ed images with one-hot labels.

    The image and label path lists are rebuilt from disk every time
    ``on_epoch_end`` runs.  The integer label of every sample that passes
    through ``get_data`` is appended to ``self.y_true``.
    """

    def __init__(self, image_folders, label_folders, dir, dim=(512, 512), batch_size=1,
                 n_classes=7, n_channels=8, shuffle=True):
        self.image_folders = image_folders
        ...  # remaining attribute assignments were elided in the original post
        self.image_paths = []
        self.label_paths = []
        # Fill the path lists (and create self.y_true) right away.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        n_batches = np.ceil(len(self.image_paths) / self.batch_size)
        return int(n_batches)

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays of batch number ``index``."""
        window = slice(index * self.batch_size, (index + 1) * self.batch_size)
        # Paths are paired purely by position in the two lists.
        return self.get_data(zip(self.image_paths[window], self.label_paths[window]))

    def on_epoch_end(self):
        """Rescan the folders, clear ``y_true`` and shuffle if enabled."""
        self.image_paths = []
        self.label_paths = []
        self.y_true = []
        for folder in self.image_folders:
            folder_path = os.path.join(self.dir, folder)
            self.image_paths.extend(
                os.path.join(folder_path, name) for name in os.listdir(folder_path)
            )
        for folder in self.label_folders:
            ...  # label listing was elided in the original post
        if self.shuffle:
            # NOTE(review): the two lists are shuffled independently, so the
            # positional image/label pairing is not preserved here.
            np.random.shuffle(self.image_paths)
            np.random.shuffle(self.label_paths)

    def get_data(self, batch):
        """Load one batch of ``(image_path, label_path)`` pairs.

        The integer after the last space on the first line of each label file
        is used as the class index.  Returns ``(X, y)``, both allocated with
        ``batch_size`` rows.
        """
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size, self.n_classes))
        for row, (image_path, label_path) in enumerate(batch):
            image = np.load(image_path)
            with open(label_path, 'r') as label_file:
                first_line = label_file.readline().strip()
            _, label_text = first_line.rsplit(' ', 1)
            label = int(label_text)
            # Recorded on every call, including repeated requests for a batch.
            self.y_true.append(label)
            X[row,] = image
            y[row,] = to_categorical(label, num_classes=self.n_classes)
        return X, y

    def get_y_true(self):
        """Return the live list of labels collected by ``get_data``."""
        return self.y_true
这是获取y_true和y_pred,并构建混淆矩阵的代码。
在这里,"y_true = val_datagen.get_y_true()"应该放在这行代码"Y_pred = model.predict"之前还是之后?
# Training batches are shuffled; validation batches keep their listing order.
train_datagen = CustomDataGenerator(image_folders, label_folders, train_dir, shuffle=True, **params)
val_datagen = CustomDataGenerator(image_folders, label_folders, valid_dir, shuffle=False, **params)

# NOTE(review): get_y_true() hands back the generator's own list, so labels
# appended by get_data() while predict() runs still show up in y_true.  The
# reported error shows 63 labels against 62 predictions; presumably one batch
# is requested an extra time during predict() - verify.
y_true = val_datagen.get_y_true()
Y_pred = model.predict(val_datagen)
# Class index with the highest predicted score for each sample.
y_pred = np.argmax(Y_pred, axis=1)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    confusion_matrix(y_true, y_pred),
    annot=True,
    fmt="d",
    cmap='Greens',
    ax=ax,
)
英文:
I'm building a confusion matrix, but y_true always comes back with the wrong shape.
I think my labels are correct; I have 62 validation samples.
I don't know where y_true should be declared or where I should retrieve it.
ValueError
Found input variables with inconsistent numbers of samples: [63, 62]
File "C:\Labbb\inceptionResnetV2\InceptionResnetV2_1.py", line 213, in <module>
sns.heatmap(confusion_matrix(y_true, y_pred),
ValueError: Found input variables with inconsistent numbers of samples: [63, 62]
I tried appending to self.y_true in get_data, returning self.y_true from a get_y_true method, resetting it with "self.y_true = []" in on_epoch_end, and using shuffle=False.
Here is the CustomDataGenerator.
Where should I declare "self.y_true = []"?
# Root folders of the training and validation data.
train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'
# Per-class sub-folder names; note that index 5 is absent from both lists.
image_folders = ['image%d' % k for k in (0, 1, 2, 3, 4, 6, 7)]
label_folders = ['label%d' % k for k in (0, 1, 2, 3, 4, 6, 7)]
class CustomDataGenerator(Sequence):
    """Batch generator serving ``np.load``-ed images with one-hot labels.

    The image and label path lists are rebuilt from disk every time
    ``on_epoch_end`` runs.  The integer label of every sample that passes
    through ``get_data`` is appended to ``self.y_true``.
    """

    def __init__(self, image_folders, label_folders, dir, dim=(512, 512), batch_size=1,
                 n_classes=7, n_channels=8, shuffle=True):
        self.image_folders = image_folders
        ...  # remaining attribute assignments were elided in the original post
        self.image_paths = []
        self.label_paths = []
        # Fill the path lists (and create self.y_true) right away.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        n_batches = np.ceil(len(self.image_paths) / self.batch_size)
        return int(n_batches)

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays of batch number ``index``."""
        window = slice(index * self.batch_size, (index + 1) * self.batch_size)
        # Paths are paired purely by position in the two lists.
        return self.get_data(zip(self.image_paths[window], self.label_paths[window]))

    def on_epoch_end(self):
        """Rescan the folders, clear ``y_true`` and shuffle if enabled."""
        self.image_paths = []
        self.label_paths = []
        self.y_true = []
        for folder in self.image_folders:
            folder_path = os.path.join(self.dir, folder)
            self.image_paths.extend(
                os.path.join(folder_path, name) for name in os.listdir(folder_path)
            )
        for folder in self.label_folders:
            ...  # label listing was elided in the original post
        if self.shuffle:
            # NOTE(review): the two lists are shuffled independently, so the
            # positional image/label pairing is not preserved here.
            np.random.shuffle(self.image_paths)
            np.random.shuffle(self.label_paths)

    def get_data(self, batch):
        """Load one batch of ``(image_path, label_path)`` pairs.

        The integer after the last space on the first line of each label file
        is used as the class index.  Returns ``(X, y)``, both allocated with
        ``batch_size`` rows.
        """
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size, self.n_classes))
        for row, (image_path, label_path) in enumerate(batch):
            image = np.load(image_path)
            with open(label_path, 'r') as label_file:
                first_line = label_file.readline().strip()
            _, label_text = first_line.rsplit(' ', 1)
            label = int(label_text)
            # Recorded on every call, including repeated requests for a batch.
            self.y_true.append(label)
            X[row,] = image
            y[row,] = to_categorical(label, num_classes=self.n_classes)
        return X, y

    def get_y_true(self):
        """Return the live list of labels collected by ``get_data``."""
        return self.y_true
Here is the code that gets y_true and y_pred and builds the confusion matrix.
Here, should "y_true = val_datagen.get_y_true()" be placed before or after the line "Y_pred = model.predict"?
# Training batches are shuffled; validation batches keep their listing order.
train_datagen = CustomDataGenerator(image_folders, label_folders, train_dir, shuffle=True, **params)
val_datagen = CustomDataGenerator(image_folders, label_folders, valid_dir, shuffle=False, **params)

# NOTE(review): get_y_true() hands back the generator's own list, so labels
# appended by get_data() while predict() runs still show up in y_true.  The
# reported error shows 63 labels against 62 predictions; presumably one batch
# is requested an extra time during predict() - verify.
y_true = val_datagen.get_y_true()
Y_pred = model.predict(val_datagen)
# Class index with the highest predicted score for each sample.
y_pred = np.argmax(Y_pred, axis=1)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    confusion_matrix(y_true, y_pred),
    annot=True,
    fmt="d",
    cmap='Greens',
    ax=ax,
)
答案1
得分: 0
我无法测试它,但现在应该可以工作了。我把 self.y_true = [] 从 on_epoch_end() 移到了 __getitem__() 中,只有在请求第一个批次(index 为 0)时才会重置。如果改用在每个 epoch 开始时触发的回调会更好。注意:只有在数据集至少被调用一次之后才能得到 y_true,因为图像和标签是在 get_data() 中逐批加载的。
另外,我也不确定批量大小为 1 时会加载多少张图像。看起来一个批次拿到的是一个图像文件夹路径,那个文件夹里是否有多张图像?
# Root folders of the training and validation data.
train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'
# Per-class sub-folder names; note that index 5 is absent from both lists.
image_folders = ['image%d' % k for k in (0, 1, 2, 3, 4, 6, 7)]
label_folders = ['label%d' % k for k in (0, 1, 2, 3, 4, 6, 7)]
class CustomDataGenerator(Sequence):
    """Batch generator serving ``np.load``-ed images with one-hot labels.

    ``image_paths[i]`` and ``label_paths[i]`` are treated as one sample (they
    are zipped together in ``__getitem__``), so any reordering is applied to
    both lists at once.  The integer labels served since the most recent
    request for batch 0 are collected in ``self.y_true``.
    """

    def __init__(self, image_folders, label_folders, dir, dim=(512, 512), batch_size=1,
                 n_classes=7, n_channels=8, shuffle=True):
        self.image_folders = image_folders
        ...  # remaining attribute assignments were elided in the original post
        self.image_paths = []
        self.label_paths = []
        # Defined up front so get_y_true() works before any batch was served.
        self.y_true = []
        self.init_paths()

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.image_paths) / self.batch_size))

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays of batch number ``index``."""
        if index == 0:  # this line here should fix it
            # A request for the first batch starts a fresh pass, so labels
            # collected during an earlier pass are dropped.
            self.y_true = []
        batch_image_paths = self.image_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch_label_paths = self.label_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch = zip(batch_image_paths, batch_label_paths)
        return self.get_data(batch)

    def init_paths(self):
        """Collect the image/label file paths once, then shuffle if enabled."""
        for folder in self.image_folders:
            image_folder_path = os.path.join(self.dir, folder)
            self.image_paths.extend(
                os.path.join(image_folder_path, file_name)
                for file_name in os.listdir(image_folder_path)
            )
        for folder in self.label_folders:
            ...  # label listing was elided in the original post
        self._shuffle_pairs()

    def on_epoch_end(self):
        """Reshuffle the samples between epochs when shuffling is enabled."""
        self._shuffle_pairs()

    def _shuffle_pairs(self):
        """Shuffle images and labels with one shared permutation.

        Shuffling the two lists independently would pair each image with an
        arbitrary label file.
        """
        if not self.shuffle:
            return
        pairs = list(zip(self.image_paths, self.label_paths))
        order = np.random.permutation(len(pairs))
        self.image_paths = [pairs[i][0] for i in order]
        self.label_paths = [pairs[i][1] for i in order]

    def get_data(self, batch):
        """Load one batch of ``(image_path, label_path)`` pairs.

        The integer after the last space on the first line of each label file
        is used as the class index.  Returns ``(X, y)`` with one row per pair
        actually received, so a short final batch carries no uninitialised
        rows.
        """
        pairs = list(batch)
        X = np.empty((len(pairs), *self.dim, self.n_channels))
        y = np.empty((len(pairs), self.n_classes))
        for i, (image_path, label_path) in enumerate(pairs):
            image = np.load(image_path)
            with open(label_path, 'r') as f:
                line = f.readline().strip()
            _, label = line.rsplit(' ', 1)
            label = int(label)
            self.y_true.append(label)
            X[i,] = image
            y[i,] = to_categorical(label, num_classes=self.n_classes)
        return X, y

    def get_y_true(self):
        """Return the labels collected since batch 0 was last requested.

        Call this after the pass has finished: ``__getitem__(0)`` rebinds
        ``self.y_true`` to a new list, so a list fetched earlier is not
        updated afterwards.
        """
        return self.y_true
英文:
I can't test it, but it should work now. I moved the self.y_true = [] reset from on_epoch_end() to __getitem__(), where it is only reset when the first batch (index 0) is requested. It would be even better to do this in a callback that runs when an epoch starts. Note that y_true is only populated after the dataset has been called at least once, because the images and labels are loaded one batch after another in get_data().
But I'm also not sure how many images are loaded with a batch size of 1. It seems one batch gets an image folder path; are there multiple images in that folder?
# Root folders of the training and validation data.
train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'
# Per-class sub-folder names; note that index 5 is absent from both lists.
image_folders = ['image%d' % k for k in (0, 1, 2, 3, 4, 6, 7)]
label_folders = ['label%d' % k for k in (0, 1, 2, 3, 4, 6, 7)]
class CustomDataGenerator(Sequence):
    """Batch generator serving ``np.load``-ed images with one-hot labels.

    ``image_paths[i]`` and ``label_paths[i]`` are treated as one sample (they
    are zipped together in ``__getitem__``), so any reordering is applied to
    both lists at once.  The integer labels served since the most recent
    request for batch 0 are collected in ``self.y_true``.
    """

    def __init__(self, image_folders, label_folders, dir, dim=(512, 512), batch_size=1,
                 n_classes=7, n_channels=8, shuffle=True):
        self.image_folders = image_folders
        ...  # remaining attribute assignments were elided in the original post
        self.image_paths = []
        self.label_paths = []
        # Defined up front so get_y_true() works before any batch was served.
        self.y_true = []
        self.init_paths()

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.image_paths) / self.batch_size))

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays of batch number ``index``."""
        if index == 0:  # this line here should fix it
            # A request for the first batch starts a fresh pass, so labels
            # collected during an earlier pass are dropped.
            self.y_true = []
        batch_image_paths = self.image_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch_label_paths = self.label_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch = zip(batch_image_paths, batch_label_paths)
        return self.get_data(batch)

    def init_paths(self):
        """Collect the image/label file paths once, then shuffle if enabled."""
        for folder in self.image_folders:
            image_folder_path = os.path.join(self.dir, folder)
            self.image_paths.extend(
                os.path.join(image_folder_path, file_name)
                for file_name in os.listdir(image_folder_path)
            )
        for folder in self.label_folders:
            ...  # label listing was elided in the original post
        self._shuffle_pairs()

    def on_epoch_end(self):
        """Reshuffle the samples between epochs when shuffling is enabled."""
        self._shuffle_pairs()

    def _shuffle_pairs(self):
        """Shuffle images and labels with one shared permutation.

        Shuffling the two lists independently would pair each image with an
        arbitrary label file.
        """
        if not self.shuffle:
            return
        pairs = list(zip(self.image_paths, self.label_paths))
        order = np.random.permutation(len(pairs))
        self.image_paths = [pairs[i][0] for i in order]
        self.label_paths = [pairs[i][1] for i in order]

    def get_data(self, batch):
        """Load one batch of ``(image_path, label_path)`` pairs.

        The integer after the last space on the first line of each label file
        is used as the class index.  Returns ``(X, y)`` with one row per pair
        actually received, so a short final batch carries no uninitialised
        rows.
        """
        pairs = list(batch)
        X = np.empty((len(pairs), *self.dim, self.n_channels))
        y = np.empty((len(pairs), self.n_classes))
        for i, (image_path, label_path) in enumerate(pairs):
            image = np.load(image_path)
            with open(label_path, 'r') as f:
                line = f.readline().strip()
            _, label = line.rsplit(' ', 1)
            label = int(label)
            self.y_true.append(label)
            X[i,] = image
            y[i,] = to_categorical(label, num_classes=self.n_classes)
        return X, y

    def get_y_true(self):
        """Return the labels collected since batch 0 was last requested.

        Call this after the pass has finished: ``__getitem__(0)`` rebinds
        ``self.y_true`` to a new list, so a list fetched earlier is not
        updated afterwards.
        """
        return self.y_true
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论