How do I load a dataset and process it without overloading RAM in Python?

Question

My TensorFlow/Keras LSTM model crashes from a RAM overload each time I start training on Kaggle, after expanding the dataset to 3.95 MB. The dataset is too heavy to load in at once, even with a dataloader, and training fails; a rough memory estimate after the code below shows why. I have searched for a solution but cannot find one. Any help would be much appreciated.

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback, ModelCheckpoint, ReduceLROnPlateau
import random
import sys

with open('/kaggle/input/crptic-python/dataset.txt', 'r') as file:
    text = file.read()

# Build the sorted set of characters appearing in the text (the vocabulary)
vocabulary = sorted(list(set(text)))

char_to_indices = dict((c, i) for i, c in enumerate(vocabulary))
indices_to_char = dict((i, c) for i, c in enumerate(vocabulary))

# Divide the text into overlapping windows of max_length characters;
# the character that follows each window is the prediction target
max_length = 100
steps = 5
sentences = []
next_chars = []
for i in range(0, len(text) - max_length, steps):
    sentences.append(text[i: i + max_length])
    next_chars.append(text[i + max_length])

# One-hot encoding each character as a boolean vector
X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype=bool)
y = np.zeros((len(sentences), len(vocabulary)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_indices[char]] = 1
    y[i, char_to_indices[next_chars[i]]] = 1

# Building the LSTM network for the task
model = Sequential()
model.add(LSTM(128, input_shape=(max_length, len(vocabulary))))
model.add(Dense(len(vocabulary)))
model.add(Activation('softmax'))
optimizer = RMSprop(learning_rate=0.01)  # 'lr' was renamed to 'learning_rate' in newer Keras
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# Helper function to sample an index from a probability array
def sample_index(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature  # small epsilon guards against log(0)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Callback to generate sample text every 30 epochs
def on_epoch_end(epoch, logs):
    if epoch % 30 == 0:
        print()
        print('----- Generating text after Epoch: %d' % epoch)

        start_index = random.randint(0, len(text) - max_length - 1)
        for diversity in [0.2, 0.5, 1.0, 1.2]:
            print('----- diversity:', diversity)

            generated = ''
            sentence = text[start_index: start_index + max_length]
            generated += sentence
            print('----- Generating with seed: "' + sentence + '"')
            sys.stdout.write(generated)

            for i in range(400):
                x_pred = np.zeros((1, max_length, len(vocabulary)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_to_indices[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample_index(preds, diversity)
                next_char = indices_to_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Checkpoint callback: save the model whenever the loss improves
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')

# Callback to reduce the learning rate when the loss plateaus
reduce_alpha = ReduceLROnPlateau(monitor='loss', factor=0.2,
                                 patience=1, min_lr=0.001)
callbacks = [print_callback, checkpoint, reduce_alpha]

# Training the LSTM model
model.fit(X, y, batch_size=128, epochs=28, callbacks=callbacks)

def generate_text(length, diversity):
    # Get random starting text
    start_index = random.randint(0, len(text) - max_length - 1)
    generated = ''
    sentence = text[start_index: start_index + max_length]
    generated += sentence
    for i in range(length):
        x_pred = np.zeros((1, max_length, len(vocabulary)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_to_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample_index(preds, diversity)
        next_char = indices_to_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char
    return generated

print(generate_text(500, 0.5))

The dataset is a text file of unorganized words and phrases; the model serves as an autocomplete implementation.
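
For scale, a quick back-of-the-envelope estimate shows why the one-hot arrays, rather than the raw text, exhaust RAM here (the vocabulary size of roughly 80 distinct characters is an assumption; substitute your own):

text_chars = 3.95e6                       # 3.95 MB of text is ~3.95 million characters
max_length, steps, vocab = 100, 5, 80     # vocab=80 is an assumed character count
n_windows = text_chars / steps            # ~790,000 training windows
x_bytes = n_windows * max_length * vocab  # one byte per bool cell in X
print(f"X alone needs about {x_bytes / 1e9:.1f} GB")  # ~6.3 GB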

Answer 1

Score: 0

You can always write a custom Python generator to load your data into the model efficiently, using the keyword yield.

The concept of building such a generator is intuitive: write a function that takes every parameter you need (the path to the file, the batch size, the vocabulary length, and so on), loop indefinitely, and inside the loop build batches of data and push them to the model.

I wrote this snippet, but it is certainly not accurate enough and you will need to review it:

def text_generator(batch_size, max_length, steps, len_vocabulary):
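    # Note: char_to_indices is read from the enclosing scope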
    while True:
        with open('/kaggle/input/crptic-python/dataset.txt', 'r') as file:
            text = file.read()
            sentences = []
            next_chars = []
            for i in range(0, len(text) - max_length, steps):
                sentences.append(text[i: i + max_length])
                next_chars.append(text[i + max_length])
                if len(sentences) == batch_size:

                    # One-hot encode each character as a boolean vector
                    X = np.zeros((len(sentences), max_length, len_vocabulary), dtype=bool)
                    y = np.zeros((len(sentences), len_vocabulary), dtype=bool)
                    for i, sentence in enumerate(sentences):
                        for t, char in enumerate(sentence):
                            X[i, t, char_to_indices[char]] = 1
                        y[i, char_to_indices[next_chars[i]]] = 1
                    yield X, y
                    sentences = []  # reset the batch buffer before filling the next one
                    next_chars = []
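
To train with the generator, pass it directly to model.fit; in TF 2.x, fit accepts Python generators (the older fit_generator is deprecated). Because the generator loops forever, steps_per_epoch must be supplied. A minimal sketch, assuming the variables from the question (text, max_length, steps, vocabulary, callbacks) are in scope:

batch_size = 128
# One full pass over the text yields this many complete batches
steps_per_epoch = ((len(text) - max_length) // steps) // batch_size

gen = text_generator(batch_size, max_length, steps, len(vocabulary))
model.fit(gen, steps_per_epoch=steps_per_epoch, epochs=28, callbacks=callbacks)

This way only one batch of one-hot arrays lives in memory at a time; the raw text itself is only a few MB and fits comfortably.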
