英文:
shutil.copy doesn't seem to copy all the expected files but doesn't throw an exception
问题
# Body of shuffle_split_data (the `def` line is not part of this excerpt).
# source_path, dest_path, train_split, val_split and
# clear_existing_destination are the function's parameters.

# Remove previous files in the working folder (if any).
if clear_existing_destination:
    for item in os.listdir(dest_path):
        item_path = os.path.join(dest_path, item)
        if os.path.isfile(item_path):
            os.remove(item_path)
        elif os.path.isdir(item_path):
            rmtree(item_path)

# Every entry of source_path is treated as one class label.
for label in os.listdir(source_path):
    train_path = os.path.join(dest_path, "training", label)
    val_path = os.path.join(dest_path, "validation", label)
    test_path = os.path.join(dest_path, "test", label)
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(val_path, exist_ok=True)
    os.makedirs(test_path, exist_ok=True)

    label_source_path = os.path.join(source_path, label)
    examples_path = os.listdir(label_source_path)
    # NOTE: np.random.choice defaults to replace=True, so this "shuffle"
    # draws WITH replacement: some file names repeat and others are dropped.
    examples_path = np.random.choice(examples_path, len(examples_path))  # shuffle

    total_examples = len(examples_path)
    n_train_examples = int(train_split * total_examples)
    n_val_examples = int(val_split * total_examples)
    # The test set is simply whatever is left after the train/val slices.

    train_examples = examples_path[:n_train_examples]
    # The "+1" hands one extra example to the validation slice.
    val_examples = examples_path[n_train_examples:(n_train_examples+n_val_examples+1)]
    test_examples = examples_path[(n_train_examples+n_val_examples+1):]
    print(len(train_examples), len(val_examples), len(test_examples))

    # Training files keep their original names.
    for file in train_examples:
        source = os.path.join(label_source_path, file)
        dest = os.path.join(train_path, file)
        copy(source, dest)

    # Validation/test files are renamed to "<name minus last 4 chars>_<label>.jpg".
    for file in val_examples:
        source = os.path.join(label_source_path, file)
        dest = os.path.join(val_path, file[:-4] + "_" + label + ".jpg")
        copy(source, dest)

    for file in test_examples:
        source = os.path.join(label_source_path, file)
        dest = os.path.join(test_path, file[:-4] + "_" + label + ".jpg")
        copy(source, dest)
print(len(os.listdir('/kaggle/working/training/Others/')))
输出为 70。
英文:
I'm writing a simple function to split a simple image binary classification kaggle dataset (using a native kaggle notebook) into a training, validation and test set. For that, for each of the two classes (whose images are stored in different folders), I'm shuffling the images and then making the splits. Finally, I'm trying to copy each image into my working folder in the appropriate sub-folders.
Here's my code to do that:
def shuffle_split_data(source_path, dest_path, train_split, val_split, test_split, clear_existing_destination=True):
    """Copy a per-label image folder into training/validation/test folders.

    Each entry of ``source_path`` is treated as a class label whose files
    are split and copied to ``dest_path/{training,validation,test}/<label>``.
    The three slice sizes are printed for every label.

    Args:
        source_path: Directory containing one sub-directory per label.
        dest_path: Existing directory that receives the split folders.
        train_split: Fraction of each label's examples used for training.
        val_split: Fraction used for validation (the slice takes one
            extra example on top of ``int(val_split * total)``).
        test_split: Accepted for symmetry but not used; the test set is
            whatever remains after the training and validation slices.
        clear_existing_destination: When true, every file and directory
            directly inside ``dest_path`` is deleted first.
    """
    # remove previous files in working folder (if any)
    if clear_existing_destination:
        for item in os.listdir(dest_path):
            item_path = os.path.join(dest_path, item)
            if os.path.isfile(item_path):
                os.remove(item_path)
            elif os.path.isdir(item_path):
                rmtree(item_path)

    for label in os.listdir(source_path):
        train_path = os.path.join(dest_path, "training", label)
        val_path = os.path.join(dest_path, "validation", label)
        test_path = os.path.join(dest_path, "test", label)
        os.makedirs(train_path, exist_ok=True)
        os.makedirs(val_path, exist_ok=True)
        os.makedirs(test_path, exist_ok=True)

        label_source_path = os.path.join(source_path, label)
        examples_path = os.listdir(label_source_path)
        # NOTE: np.random.choice defaults to replace=True, so this "shuffle"
        # draws WITH replacement: some file names repeat and others are
        # dropped, which is why fewer distinct files end up being copied.
        examples_path = np.random.choice(examples_path, len(examples_path))  # shuffle

        total_examples = len(examples_path)
        n_train_examples = int(train_split * total_examples)
        n_val_examples = int(val_split * total_examples)

        train_examples = examples_path[:n_train_examples]
        # The "+1" hands one extra example to the validation slice.
        val_examples = examples_path[n_train_examples:(n_train_examples+n_val_examples+1)]
        test_examples = examples_path[(n_train_examples+n_val_examples+1):]
        print(len(train_examples), len(val_examples), len(test_examples))

        # Training files keep their original names.
        for file in train_examples:
            source = os.path.join(label_source_path, file)
            dest = os.path.join(train_path, file)
            copy(source, dest)

        # Validation/test files are renamed to
        # "<name minus its last 4 characters>_<label>.jpg".
        for file in val_examples:
            source = os.path.join(label_source_path, file)
            dest = os.path.join(val_path, file[:-4] + "_" + label + ".jpg")
            copy(source, dest)

        for file in test_examples:
            source = os.path.join(label_source_path, file)
            dest = os.path.join(test_path, file[:-4] + "_" + label + ".jpg")
            copy(source, dest)
# Split the Kaggle dataset folder into the working directory, wiping any
# previous contents first. train_split, val_split and test_split are not
# defined in this snippet (presumably set earlier in the notebook).
shuffle_split_data(
'/kaggle/input/monkeypox-skin-lesion-dataset/Original Images/Original Images',
'/kaggle/working/',
train_split,
val_split,
test_split,
clear_existing_destination=True
)
The output to console is simply:
100 13 13
81 11 10
These are the numbers of images that the training, validation and test sets are expected to contain for each of the two labels/classes.
I'm assuming that shutil.copy isn't copying all the images properly, because when I examine my training folder for the first class (the one that should have 100 images for training) using the following code:
print(len(os.listdir('/kaggle/working/training/Others/')))
The output is 70. For some reason, this number changes every time that I run the function above.
What am I missing? I assume the error must be really dumb, but I've been debugging and trying to locate it for the past hours and haven't been able to make any progress. Thank you in advance!
答案1
得分: 2
问题在于 `np.random.choice` 的调用方式。
使用默认参数 `replace=True` 时,执行的是“有放回抽样”,即“`a` 中的同一个值可以被多次选中”(文档)。
所以在这行之后:
examples_path = np.random.choice(examples_path, len(examples_path))
数组 `examples_path` 包含的文件名数量与之前相同,但其中一些文件名会重复,而另一些文件名会缺失。
一种修复方法是显式传入参数 `replace=False`:
examples_path = np.random.choice(examples_path, len(examples_path), replace=False)
但更清晰、更简洁的做法可能是改用函数 `np.random.shuffle`:
np.random.shuffle(examples_path)
这会原地对数组进行洗牌(即它不返回结果,而是直接修改现有的 `examples_path`),正好满足你在这里的需求。
英文:
The problem here is the way that `np.random.choice` is called.
With the default parameter `replace=True`, "sampling with replacement" is performed, i.e. "a value of `a` can be selected multiple times" (documentation).
So after the line:
examples_path = np.random.choice(examples_path, len(examples_path))
the array `examples_path` contains the same number of filenames it did before, but some filenames will be duplicated and others will be missing.
One fix is to supply the parameter `replace=False`:
examples_path = np.random.choice(examples_path, len(examples_path), replace=False)
But it's probably clearer and more concise to switch to the function `np.random.shuffle` instead:
np.random.shuffle(examples_path)
This shuffles the array in place (i.e. it doesn't return a result, it modifies the existing `examples_path`) and is intended to do exactly what you need here.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论