英文:
Avoid allocating a large amount of memory for concatenation in C programming
问题
以下是您提供的代码的翻译部分:
提供的代码从文本文件中读取行,并将它们存储在名为lines的动态分配的二维数组中(该模式在第32767行后始终重复)。然后它连接特定的行(lines[j]、lines[k]、lines[m])并将结果打印到输出文件中。问题是我的input.txt文件非常大,达到了100GB,以至于我的电脑无法处理它。因为我使用malloc为二维数组分配了如此多的内存,以至于我的硬盘已经满了。是否有可能修改我的代码,以便我不必分配如此多的内存?感谢任何帮助!
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_LINE_LENGTH 300 // 指定输入文件中行的最大长度
#define MAX_LINES 500000000 // 指定输入文件的最大行数
int main()
{
// 打开文件
FILE *fp = fopen("input.txt", "r");
if (fp == NULL)
{
printf("无法打开文件。\n");
return 1;
}
// 为lines分配内存
char **lines = (char **)malloc(MAX_LINES * sizeof(char *));
if (lines == NULL)
{
printf("无法为lines分配内存。\n");
fclose(fp);
return 1;
}
for (int i = 0; i < MAX_LINES; i++)
{
lines[i] = (char *)malloc(MAX_LINE_LENGTH * sizeof(char));
if (lines[i] == NULL)
{
printf("无法为第%d行分配内存。\n", i);
// 释放之前分配的lines
for (int j = 0; j < i; j++)
{
free(lines[j]);
}
free(lines);
fclose(fp);
return 1;
}
}
// 将文件指针重置到开头
fseek(fp, 0, SEEK_SET);
// 从文件中读取行并存储在lines数组中
int lineIndex = 0;
char line[MAX_LINE_LENGTH];
while (fgets(line, sizeof(line), fp) != NULL)
{
// 从行末尾删除换行字符
if (line[strlen(line) - 1] == '\n')
line[strlen(line) - 1] = '\0';
strcpy(lines[lineIndex], line);
lineIndex++;
}
fclose(fp);
// 创建输出文件
FILE *outputFile = fopen("output.txt", "w");
if (outputFile == NULL)
{
perror("创建输出文件时出错");
// 释放分配的lines
for (int i = 0; i < MAX_LINES; i++)
{
free(lines[i]);
}
free(lines);
return 1;
}
int i=1;
int j=5; //开始Permno编号
int k=10927; //开始PRC日期
int m=21849; //开始DLRETX数据
int count=0;
int jj=10923; // 结束PERNMO数据
int kk=21846; // 结束PRC数据
int mm=32767; // 结束DLRETX数据
int count_whileLoop=0;
// 追加行
for (i,j,k,m; j<=jj,k<=kk,m<=mm; i++,j++,k++,m++)
{
fprintf(outputFile, "%s%s%s\n", lines[j], lines[k], lines[m]);
}
// 释放分配的内存
for (int i = 0; i < MAX_LINES; i++)
{
free(lines[i]);
}
free(lines);
// 关闭输出文件
fclose(outputFile);
return 0;
}
请注意,这是代码的翻译版本,不包括任何其他内容。如果您需要进一步的帮助,请随时提问。
英文:
The provided code reads lines from a text file and stores them in a dynamically allocated 2D array called lines (the pattern always repeats after line 32767). It then concatenates specific lines (lines[j], lines[k], lines[m]) and prints the result in an output file. The problem is that my input.txt file is so huge that my PC cannot handle this - 100GB. Because with malloc I allocate so much memory for a 2D array that my hard drive is full. Is there any possibility to change my code that i did not have to allocate so much memory? Thanks for any help!
#include <stdlib.h>
#include <string.h>
#define MAX_LINE_LENGTH 300 // specifies the maximum length of a line in the input file
#define MAX_LINES 500000000 // specifies the maximum number of lines of the input file
int main()
{
// Open the file
FILE *fp = fopen("input.txt", "r");
if (fp == NULL)
{
printf("Failed to open the file.\n");
return 1;
}
// Allocate memory for lines
char **lines = (char **)malloc(MAX_LINES * sizeof(char *));
if (lines == NULL)
{
printf("Failed to allocate memory for lines.\n");
fclose(fp);
return 1;
}
for (int i = 0; i < MAX_LINES; i++)
{
lines[i] = (char *)malloc(MAX_LINE_LENGTH * sizeof(char));
if (lines[i] == NULL)
{
printf("Failed to allocate memory for line %d.\n", i);
// Free previously allocated lines
for (int j = 0; j < i; j++)
{
free(lines[j]);
}
free(lines);
fclose(fp);
return 1;
}
}
// Reset file pointer to the beginning
fseek(fp, 0, SEEK_SET);
// Read lines from the file and store them in lines array
int lineIndex = 0;
char line[MAX_LINE_LENGTH];
while (fgets(line, sizeof(line), fp) != NULL)
{
// Remove newline character from the end of the line
if (line[strlen(line) - 1] == '\n')
line[strlen(line) - 1] = '\0';
strcpy(lines[lineIndex], line);
lineIndex++;
}
fclose(fp);
// Create output file
FILE *outputFile = fopen("output.txt", "w");
if (outputFile == NULL)
{
perror("Error creating output file");
// Free allocated lines
for (int i = 0; i < MAX_LINES; i++)
{
free(lines[i]);
}
free(lines);
return 1;
}
int i=1;
int j=5; //begin Permno number
int k=10927; //begin PRC date
int m=21849; //begin DLRETX data
int count=0;
int jj=10923; // end PERNMO data
int kk=21846; // end PRC data
int mm=32767; // end DLRETX data
int count_whileLoop=0;
// Append lines
for (i,j,k,m; j<=jj,k<=kk,m<=mm; i++,j++,k++,m++)
{
fprintf(outputFile, "%s%s%s\n", lines[j], lines[k], lines[m]);
}
// Free allocated memory
for (int i = 0; i < MAX_LINES; i++)
{
free(lines[i]);
}
free(lines);
// Close the output file
fclose(outputFile);
return 0;
}
答案1
得分: 2
以下是代码的翻译部分:
- 对于大文件,最好使用
mmap
而不是读取数据。 - 如果我们记住了行的起始偏移量和长度(在一个名为 "line" 的结构体数组中),就不需要移除换行符(所以文件可以映射为只读)。
- 通过这种方式,每行只使用了两个
size_t
值来表示偏移和长度。也就是说,我们不需要使用strdup/malloc
来复制实际数据。 - 我们可以将行范围放入一个结构体中(在下面的代码中称为 "segment")。我们可以拥有尽可能多的这种结构体(不仅限于问题中的 3 个)。
- 我们可以在达到最大行数后停止读取。这大大减少了空间需求(例如,在您的示例中,最大行号为 32767)。
- 使用格式
%s%s%s\n
来创建一个连接的输出行相对较慢。最好使用fputc
或fwrite
。
关于 mmap
想法的背景信息,这是我的一个回答:read line by line in the most efficient way platform specific
这是重新构想的代码,带有注释。
用于测试的话,它允许使用 -G
选项来指定生成随机输入测试文件。
英文:
A few ideas ...
-
With large files, it's better to use
mmap
instead of reading in the data. -
If we remember the starting offset of a line and its length [in an array of "line"
struct
], we don't have to remove the newline [so the file can be mapped R/O]. -
With this, each line only uses two
size_t
values for offset/length. (i.e.) We don't have tostrdup/malloc
the actual data. -
We can put line ranges into a
struct
(called a "segment" in the code below). We can have as many as we want [vs. just the 3 in the problem]. -
We can stop reading after we hit the maximum line number. This is a major reduction in space (e.g. In your example, the maximum line number is 32767).
-
Doing
fprintf
with a format of%s%s%s\n"
to create a joined output line is relatively slow. Better to usefputc
orfwrite
For some background on the mmap
idea, here is an answer of mine: read line by line in the most efficient way platform specific
Here is the reimagined code. It is annotated.
For testing, it allows a -G
option to specify the generation of a random input test file.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/mman.h>
#define sysfault(_fmt...) \
do { \
printf(_fmt); \
exit(1); \
} while (0)
#if DEBUG
#define dbgprt(_fmt...) \
printf(_fmt)
#else
#define dbgprt(_fmt...) \
do { } while (0)
#endif
// dynamic list control
typedef struct {
void *list_base; // pointer to data
size_t list_size; // sizeof(element)
size_t list_count; // # of elements used
size_t list_cap; // # of elements allocated (capacity)
} list_t;
typedef struct {
size_t line_off; // absolute offset of line start
size_t line_len; // line length (inc newline)
} line_t;
typedef struct {
size_t seg_beg; // starting line index (inclusive)
size_t seg_end; // ending line index (inclusive)
size_t seg_cur; // current line index (inclusive)
size_t seg_count; // line count
} seg_t;
int opt_a; // load _all_ data
size_t opt_G; // generate input of this many bytes
int opt_L; // maximum line width
list_t *seglist;
size_t segend; // maximum line number we have to get
const char *inpbase; // pointer to input file data
size_t inpsize; // input file size
list_t *linelist; // list of lines
// listnew -- get new list
list_t *
listnew(size_t siz)
{
list_t *list = calloc(1,sizeof(*list));
list->list_size = siz;
return list;
}
// listloc -- point to list entry
void *
listloc(list_t *list,size_t idx)
{
void *ptr = list->list_base;
ptr += idx * list->list_size;
return ptr;
}
// listadd -- point to new list entry
void *
listadd(list_t *list)
{
if (list->list_count >= list->list_cap) {
list->list_cap += 100;
list->list_base = realloc(list->list_base,
list->list_size * list->list_cap);
}
void *ptr = listloc(list,list->list_count);
list->list_count += 1;
return ptr;
}
// seginit -- initialize line number range segment
void
seginit(size_t beg,size_t end)
{
seg_t *seg = listadd(seglist);
seg->seg_beg = beg;
seg->seg_end = end;
seg->seg_count = (end + 1) + beg;
if (end > segend)
segend = end;
printf("seginit: seg_beg=%zu seg_end=%zu seg_count=%zu\n",
seg->seg_beg,seg->seg_end,seg->seg_count);
seg->seg_cur = seg->seg_beg;
}
// linegeta -- read in all input lines
void
linegeta(size_t maxlines)
{
const char *lhs = inpbase;
size_t remsize = inpsize;
const char *rhs;
size_t curlen;
if (maxlines != 0)
maxlines += 20;
printf("lineget: remsize=%zu maxlines=%zu\n",remsize,maxlines);
list_t *list = listnew(sizeof(line_t));
linelist = list;
for (; remsize > 0; remsize -= curlen) {
rhs = memchr(lhs,'\n',remsize);
if (rhs == NULL)
break;
// point past newline
++rhs;
// get length of line (inc newline)
curlen = rhs - lhs;
// remember starting offset and length of current line
line_t *line = listadd(list);
line->line_off = lhs - inpbase;
line->line_len = curlen;
// point past newline
lhs = rhs;
// stop on line limit
if (maxlines != 0) {
if (list->list_count >= maxlines)
break;
}
}
printf("lineget: list_count=%zu\n",list->list_count);
}
// segput -- output a line segment
size_t
segput(FILE *xfout,seg_t *seg)
{
size_t outlen = 0;
do {
size_t lnoidx = seg->seg_cur;
int over = (lnoidx > seg->seg_end);
dbgprt("segput: seg_cur=%zu seg_end=%zu%s\n",
seg->seg_cur,seg->seg_end,over ? " OVER" : "");
if (over)
break;
++seg->seg_cur;
const line_t *line = listloc(linelist,lnoidx);
const char *buf = inpbase + line->line_off;
size_t explen = line->line_len - 1;
outlen = fwrite(buf,1,explen,xfout);
if (outlen == explen)
break;
sysfault("segput: explen=%zu outlen=%zu -- %s\n",
explen,outlen,strerror(errno));
} while (0);
return outlen;
}
// lineput -- create an output line
int
lineput(FILE *xfout)
{
list_t *list = seglist;
int outcnt = 0;
for (int segidx = 0; segidx < list->list_count; ++segidx) {
seg_t *seg = listloc(list,segidx);
if (segput(xfout,seg))
++outcnt;
}
if (outcnt > 0)
fprintf(xfout,"\n");
return outcnt;
}
// lineputa -- output all joined lines
void
lineputa(FILE *xfout)
{
size_t outcnt = 0;
while (1) {
if (! lineput(xfout))
break;
++outcnt;
if (outcnt > linelist->list_count)
sysfault("lineputa: too much output\n");
}
}
// xrand -- fast random number generator
unsigned int
xrand(void)
{
static unsigned int seed = 1;
// this is random in TYPE_0 (linear congruential) mode with the mask
// removed to get full range numbers
// this does _not_ produce overlaps
seed = (seed * 1103515245) + 12345;
return seed;
}
// genline -- generate a line number
size_t
genline(FILE *xfdst,size_t lno)
{
char buf[1000];
char *bp = buf;
size_t off = ftell(xfdst);
bp += sprintf(bp," %zu/%zu:",off,lno);
ssize_t remlen = bp - buf;
remlen = (opt_L - 20) - remlen;
if (remlen <= 0)
remlen = 1;
remlen = (xrand() % remlen) + 1;
for (; remlen > 0; --remlen, ++bp) {
int chr = xrand() % 26;
chr += 'A';
*bp = chr;
}
*bp++ = '\n';
size_t explen = bp - buf;
size_t actlen = fwrite(buf,1,explen,xfdst);
if (actlen != explen)
sysfault("genline: fwrite lno=%zu -- %s\n",lno,strerror(errno));
return actlen;
}
// genfile -- generate random input file
void
genfile(const char *file,size_t maxsize,size_t lnomax)
{
size_t totsize = 0;
size_t curlen;
size_t lnocur = 1;
if (lnomax != 0)
lnomax += 1000;
printf("genfile: maxsize=%zu lnomax=%zu\n",maxsize,lnomax);
FILE *xfdst = fopen(file,"w");
for (; (totsize < maxsize); totsize += curlen, ++lnocur) {
if ((lnomax != 0) && (lnocur >= lnomax))
break;
curlen = genline(xfdst,lnocur);
}
fclose(xfdst);
printf("genfile: totsize=%zu lnocur=%zu\n",totsize,lnocur);
}
int
main(int argc,char **argv)
{
--argc;
++argv;
for (; argc > 0; --argc, ++argv) {
char *cp = *argv;
if (*cp != '-')
break;
cp += 2;
switch (cp[-1]) {
case 'a':
opt_a = ! opt_a;
break;
case 'G':
opt_G = (*cp != 0) ? strtol(cp,&cp,10) : 100;
switch (*cp) {
case 'M':
opt_G *= 1024 * 1024;
break;
default:
opt_G *= 1024 * 1024 * 1024;
break;
}
break;
case 'L':
opt_L = (*cp != 0) ? strtol(cp,&cp,10) : 40;
break;
}
}
if (opt_L <= 0)
opt_L = 40;
// get ranges we want
seglist = listnew(sizeof(seg_t));
seginit(5,10923); // Permno number
seginit(10927,21846); // PRC date
seginit(21849,32767); // DLRETX data
if (opt_a)
segend = 0;
if (argc != 2)
sysfault("main: need two args\n");
const char *inpfile = *argv++;
const char *outfile = *argv++;
// DEBUG: generate random test input file (careful!)
if (opt_G)
genfile(inpfile,opt_G,segend);
int fdinp = open(inpfile,O_RDONLY);
if (fdinp < 0)
sysfault("main: unable to open '%s' -- %s\n",inpfile,strerror(errno));
struct stat st;
fstat(fdinp,&st);
inpsize = st.st_size;
printf("st_size=%zu\n",inpsize);
// map the input file
void *base = mmap(NULL,inpsize,PROT_READ,MAP_SHARED,fdinp,0);
if (base == MAP_FAILED)
sysfault("main: unable to mmap -- %s\n",strerror(errno));
inpbase = base;
// find all input line offsets and lengths
linegeta(segend);
FILE *xfout = fopen(outfile,"w");
if (xfout == NULL)
sysfault("main: unable to open '%s' -- %s\n",inpfile,strerror(errno));
// output all joined lines
lineputa(xfout);
fclose(xfout);
munmap(base,st.st_size);
close(fdinp);
return 0;
}
答案2
得分: 2
以下是翻译的内容:
另一种解决方法,与原始编码风格一致。
由于您正在阅读长度未指定的行,与gets()
相比,更好的选择是getline()
(如果您使用MSVC,可以在此处找到它)。它允许您读取行而不必担心正确调整输入缓冲区的大小。
鉴于我们正在追加,这里定义了另一个有用的函数来重新分配和追加。然后事情就相当标准了:我定义了一个存储各节开始点、节长度(这样我们就不需要担心它们的大小相同)和它们数量的向量。
然后,一旦我们到达一个节,我们开始追加第一个节,然后是第二个,然后是第三个,都追加到相同的行上。输出只是将这些行写入文件。
英文:
Another take on the problem, in line with the original coding style.
Since you are reading lines of unspecified length, a better alternative to gets()
is getline()
(available here if you are using MSVC). It allows you to read lines without worry of correctly sizing the input buffer.
Given that we are appending, another useful function is defined here to realloc and append. Then things are pretty standard: I define a vector of sections starting points, the section length (so we don't need to worry that they are of the same size) and their number.
Then, as soon as we reach a section we start appending the first section, then the second, then the third, to the same lines. Output is just writing the lines to file.
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#ifdef _MSC_VER
#include "getline.h"
#endif
const int sections[] = {
5, // PERNMO
10927, // PRC
21849, // DLRETX
};
const int sections_lengths = 10919;
const int nsections = 3;
// Tries to realloc dst to allow appending src.
ssize_t strcat_realloc(char ** dst, const char *src)
{
if (dst == NULL || src == NULL) {
errno = EINVAL;
return -1;
}
size_t ndst = *dst ? strlen(*dst) : 0;
size_t nsrc = strlen(src);
char *tmp = realloc(*dst, ndst + nsrc + 1);
if (tmp == NULL) {
errno = ENOMEM;
return -1;
}
memcpy(tmp + ndst, src, nsrc + 1);
*dst = tmp;
return ndst + nsrc;
}
int main(void)
{
FILE *fp = fopen("input.txt", "r");
if (fp == NULL) {
printf("Failed to open the file.\n");
return 1;
}
char **lines = calloc(sections_lengths, sizeof(char*));
int cur = 0;
int lineIndex = 0;
char *line = NULL;
size_t len = 0;
while (getline(&line,&len, fp) != -1) {
line[strcspn(line, "\n")] = 0; // Remove newline character from the end of the line
if (lineIndex == sections[cur] + sections_lengths) {
++cur;
if (cur == nsections) {
break;
}
}
if (lineIndex >= sections[cur]) {
strcat_realloc(lines + lineIndex - sections[cur], line); // TODO: manage errors
}
lineIndex++;
}
fclose(fp);
int result = 0;
// Create output file
FILE *outputFile = fopen("output.txt", "w");
if (outputFile == NULL) {
perror("Error creating output file");
result = 1;
}
else {
// Append lines
for (int i = 0; i < sections_lengths; ++i) {
fprintf(outputFile, "%s\n", lines[i]);
}
fclose(outputFile);
}
// Free allocated memory
for (int i = 0; i < sections_lengths; ++i) {
free(lines[i]);
}
free(lines);
return result;
}
答案3
得分: 2
你只需要处理文件的前32768行,所以不需要读取更多内容。但是,你也说了:
在第32767行之后模式总是重复的
所以我假设你真正想要的是批量处理32768行。无论哪种情况,这意味着你只需要一次存储那么多行的空间,而不是一次性存储整个文件。这只有大约10MB,可以轻松放在文件范围变量中。
所以将代码更改为使用静态大小的缓冲区,消除任何动态分配,然后循环读取每次32768行并处理它们。
另外:
j<=jj,k<=kk,m<=mm
不会产生你期望的结果。你可能想要j<=jj && k<=kk && m<=mm
- 变量
jj
,kk
, 和mm
不够描述性。最好使用名称来表示各段的起始/结束点。
通过以上更改,你现在有以下代码:
#include <stdlib.h>
#include <string.h>
#define MAX_LINE_LENGTH 300 // 输入文件中一行的最大长度
#define MAX_BATCH 32768 // 输入批次中的行数
#define PERMO_START 5
#define PRC_START 10927
#define DLRETX_START 21849
#define PERMO_END 10923
#define PRC_END 21846
#define DLRETX_END 32767
char lines[MAX_BATCH][MAX_LINE_LENGTH];
int main()
{
// 打开输入文件
FILE *fp = fopen("input.txt", "r");
if (fp == NULL)
{
printf("无法打开文件。\n");
return 1;
}
// 创建输出文件
FILE *outputFile = fopen("output.txt", "w");
if (outputFile == NULL)
{
perror("创建输出文件时出错");
return 1;
}
int done = 0;
do {
memset(lines, 0, sizeof lines);
// 从文件中读取行并将它们存储在lines数组中
int lineIndex = 0;
char line[MAX_LINE_LENGTH];
while ((lineIndex < MAX_BATCH) && (fgets(line, sizeof(line), fp) != NULL))
{
// 从行末删除换行字符
if (line[strlen(line) - 1] == '\n')
line[strlen(line) - 1] = '\0';
strcpy(lines[lineIndex], line);
lineIndex++;
}
done = lineIndex < MAX_BATCH;
// 追加行
for (int j=PERMO_START, k=PRC_START, m=DLRETX_START;
j<=PERMO_END && k<=PRC_END && m<=DLRETX_END;
j++,k++,m++)
{
fprintf(outputFile, "%s%s%s\n", lines[j], lines[k], lines[m]);
}
} while (!done);
// 关闭输入和输出文件
fclose(fp);
fclose(outputFile);
return 0;
}
(Note: The code provided is a C code snippet with some modifications for improved readability, and it processes input text files in batches of 32768 lines while appending specific lines from each batch to an output file.)
英文:
You're only processing the first 32768 lines of the file, so you don't need to read any more than that. However, you also said:
> the pattern always repeats after line 32767
So I'm assuming what you really want is to process batches of 32768 lines. In either case, that means you only need space to store that many lines at a time, not the whole file all at once. That's only about 10MB, which can easily be set aside in a file scope variable.
So change the code to use a static buffer of that size, eliminating any dynamic allocation, then loop through reading 32768 lines at a time and processing them.
Also:
j<=jj,k<=kk,m<=mm
doesn't do what you expect. You probably wantj<=jj && k<=kk && m<=mm
- The variables
jj
,kk
, andmm
are not very descriptive. Better to use names for the section start/end points.
With the above changes, you now have the following:
#include <stdlib.h>
#include <string.h>
#define MAX_LINE_LENGTH 300 // maximum length of a line in the input file
#define MAX_BATCH 32768 // the number of lines in an input batch
#define PERMO_START 5
#define PRC_START 10927
#define DLRETX_START 21849
#define PERMO_END 10923
#define PRC_END 21846
#define DLRETX_END 32767
char lines[MAX_BATCH][MAX_LINE_LENGTH];
int main()
{
// Open the input file
FILE *fp = fopen("input.txt", "r");
if (fp == NULL)
{
printf("Failed to open the file.\n");
return 1;
}
// Create output file
FILE *outputFile = fopen("output.txt", "w");
if (outputFile == NULL)
{
perror("Error creating output file");
return 1;
}
int done = 0;
do {
memset(lines, 0, sizeof lines);
// Read lines from the file and store them in lines array
int lineIndex = 0;
char line[MAX_LINE_LENGTH];
while ((lineIndex < MAX_BATCH) && (fgets(line, sizeof(line), fp) != NULL))
{
// Remove newline character from the end of the line
if (line[strlen(line) - 1] == '\n')
line[strlen(line) - 1] = '\0';
strcpy(lines[lineIndex], line);
lineIndex++;
}
done = lineIndex < MAX_BATCH;
// Append lines
for (int j=PERMO_START, k=PRC_START, m=DLRETX_START;
j<=PERMO_END && k<=PRC_END && m<=DLRETX_END;
j++,k++,m++)
{
fprintf(outputFile, "%s%s%s\n", lines[j], lines[k], lines[m]);
}
} while (!done);
// Close the input and output file
fclose(fp);
fclose(outputFile);
return 0;
}
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论