How do I load a dataset and process it without overloading RAM in Python?


Question

My TensorFlow/Keras LSTM model crashes with a RAM overload every time I start a training run on Kaggle, after I expanded the dataset to 3.95 MB. The dataset appears to be too large to load in all at once, even with a dataloader, which breaks training. I have searched for a solution but cannot find one. Any support would be much appreciated.

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback, ModelCheckpoint, ReduceLROnPlateau
import random
import sys

with open('/kaggle/input/crptic-python/dataset.txt', 'r') as file:
    text = file.read()

# Build the vocabulary: the sorted set of unique characters in the text
vocabulary = sorted(list(set(text)))

char_to_indices = dict((c, i) for i, c in enumerate(vocabulary))
indices_to_char = dict((i, c) for i, c in enumerate(vocabulary))

# Divide the text into subsequences of length max_length,
# so that at each step a window of max_length characters is fed into the network
max_length = 100
steps = 5
sentences = []
next_chars = []
for i in range(0, len(text) - max_length, steps):
    sentences.append(text[i: i + max_length])
    next_chars.append(text[i + max_length])

# One-hot encode each character into a boolean vector
X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype=bool)
y = np.zeros((len(sentences), len(vocabulary)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_indices[char]] = 1
    y[i, char_to_indices[next_chars[i]]] = 1

# Build the LSTM network for the task
model = Sequential()
model.add(LSTM(128, input_shape=(max_length, len(vocabulary))))
model.add(Dense(len(vocabulary)))
model.add(Activation('softmax'))
optimizer = RMSprop(learning_rate=0.01)  # 'lr' is deprecated in recent Keras releases
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# Helper function to sample an index from a probability array (temperature sampling)
def sample_index(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Helper callback that generates sample text at the end of every 30th epoch
def on_epoch_end(epoch, logs):
    if epoch % 30 == 0:
        print()
        print('----- Generating text after Epoch: %d' % epoch)

        start_index = random.randint(0, len(text) - max_length - 1)
        for diversity in [0.2, 0.5, 1.0, 1.2]:
            print('----- diversity:', diversity)

            generated = ''
            sentence = text[start_index: start_index + max_length]
            generated += sentence
            print('----- Generating with seed: "' + sentence + '"')
            sys.stdout.write(generated)

            for i in range(400):
                x_pred = np.zeros((1, max_length, len(vocabulary)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_to_indices[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample_index(preds, diversity)
                next_char = indices_to_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Save the model after each epoch in which the loss decreases
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')

# Reduce the learning rate when the loss plateaus
reduce_alpha = ReduceLROnPlateau(monitor='loss', factor=0.2,
                                 patience=1, min_lr=0.001)
callbacks = [print_callback, checkpoint, reduce_alpha]

# Train the LSTM model
model.fit(X, y, batch_size=128, epochs=28, callbacks=callbacks)

def generate_text(length, diversity):
# Pick a random seed sequence from the text
    start_index = random.randint(0, len(text) - max_length - 1)
    generated = ''
    sentence = text[start_index: start_index + max_length]
    generated += sentence
    for i in range(length):
        x_pred = np.zeros((1, max_length, len(vocabulary)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_to_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample_index(preds, diversity)
        next_char = indices_to_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char
    return generated

print(generate_text(500, 0.5))
The dataset is a text file of unorganized words and phrases; this model serves as an autocomplete implementation.

Answer 1

Score: 0

You can always write a custom Python generator to load your data into the model efficiently, using the yield keyword.

I don't know whether you have implemented something similar before, but the concept behind building this generator is intuitive: write a function that takes all the parameters you need (path to the file, batch size, vocabulary length, and so on), then run an infinite loop whose body loads the data into batches and pushes them to the model.

I wrote this snippet, but it is certainly not accurate enough and you will need to review it:

def text_generator(batch_size, max_length, steps, len_vocabulary):
    # char_to_indices is taken from the enclosing (global) scope
    while True:
        with open('/kaggle/input/crptic-python/dataset.txt', 'r') as file:
            text = file.read()
        sentences = []
        next_chars = []
        for i in range(0, len(text) - max_length, steps):
            sentences.append(text[i: i + max_length])
            next_chars.append(text[i + max_length])
            if len(sentences) == batch_size:
                # One-hot encode each character into a boolean vector
                X = np.zeros((len(sentences), max_length, len_vocabulary), dtype=bool)
                y = np.zeros((len(sentences), len_vocabulary), dtype=bool)
                for b, sentence in enumerate(sentences):
                    for t, char in enumerate(sentence):
                        X[b, t, char_to_indices[char]] = 1
                    y[b, char_to_indices[next_chars[b]]] = 1
                yield X, y
                # Reset the batch buffers for the next batch
                sentences = []
                next_chars = []
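
To plug this generator into training, you can pass it directly to model.fit together with steps_per_epoch, since a generator has no length (on older Keras versions, use model.fit_generator instead). A minimal usage sketch, assuming the text, max_length, steps, vocabulary, model, and callbacks already defined in the question:

batch_size = 128
# Total sliding windows over the text, and full batches per pass
num_windows = (len(text) - max_length) // steps
steps_per_epoch = num_windows // batch_size

model.fit(
    text_generator(batch_size, max_length, steps, len(vocabulary)),
    steps_per_epoch=steps_per_epoch,
    epochs=28,
    callbacks=callbacks,
)

Note that the generator drops any partial batch left over at the end of a pass over the file, which is why steps_per_epoch uses floor division.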

huangapple
  • Posted on 2023-06-16 07:11:30
  • Please retain this link when reposting: https://go.coder-hub.com/76486031.html