在C中读写Unicode字符到文件。

huangapple go评论52阅读模式
英文:

Reading and writing unicode characters to file in C

问题

以下是您的代码的中文翻译:

我正在尝试创建一个程序,将Unicode字符保存在文本文件中(文件名为data.txt),当我再次加载文件时,它们将在终端中正确打印出来。这是我的代码:

#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
#define ON_WINDOWS
#endif

#ifdef ON_WINDOWS
#define _CRT_SECURE_NO_WARNINGS
#include <io.h>     // _setmode
#include <fcntl.h>  // _O_U16TEXT

// 以防mingw最终没有定义它:
#ifndef _O_U16TEXT
#define _O_U16TEXT (0x20000)
#endif
#endif

#include <stdio.h>
#include <locale.h>
#include <wchar.h>
#define SIZE 100

void set_locale_mode() {
#ifdef ON_WINDOWS
   // Unicode UTF-16,小字节序(ISO 10646的BMP)
   const char* CP_UTF_16LE = ".1200";

   setlocale(LC_ALL, CP_UTF_16LE);
   _setmode(_fileno(stdin), _O_U16TEXT);
   _setmode(_fileno(stdout), _O_U16TEXT);
#else
   setlocale(LC_ALL, "");
#endif
}

int main(void) {

   set_locale_mode();

   wchar_t myString[SIZE];
   wchar_t loadedString[SIZE];

   wprintf(L"输入3个字符:");
   wscanf(L"%ls", myString);
   wprintf(L"您的输入是%ls\n", myString);

   FILE *pFile;

   if(pFile=fopen("data.txt", "w")) {
      fwprintf(pFile, L"%ls", myString);
   } else {
      wprintf(L"无法写入文件!\n");
   }

   fclose(pFile);

   if(pFile=fopen("data.txt", "r")) {
      fwscanf(pFile, L"%ls", loadedString);
   } else {
    wprintf(L"无法从文件中读取!\n");
   } 

   fclose(pFile);

   wprintf(L"加载的字符串是%ls", loadedString);

   return 0;
}

对于您的问题,第三次尝试中出现问题的原因是文件的字符编码格式。您在第三次尝试中复制了一些中文字符并粘贴到终端,但文件的字符编码格式可能与终端不匹配,导致乱码。

要解决此问题,您可以尝试在文件操作中指定正确的字符编码格式,例如UTF-8。这将确保文件中的Unicode字符以正确的方式保存和加载。

英文:

I am trying to create a program that saves unicode characters in a text file (named data.txt below) and when I load the file again they will be correctly printed in the terminal. This is my code:

#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
#define ON_WINDOWS
#endif

#ifdef ON_WINDOWS
#define _CRT_SECURE_NO_WARNINGS
#include &lt;io.h&gt;     // _setmode
#include &lt;fcntl.h&gt;  // _O_U16TEXT

// just in case mingw doesn&#39;t define it after all:
#ifndef _O_U16TEXT
#define _O_U16TEXT (0x20000)
#endif
#endif

#include &lt;stdio.h&gt;
#include &lt;locale.h&gt;
#include &lt;wchar.h&gt;
#define SIZE 100

void set_locale_mode() {
#ifdef ON_WINDOWS
   // Unicode UTF-16, little endian byte order (BMP of ISO 10646)
   const char* CP_UTF_16LE = &quot;.1200&quot;;

   setlocale(LC_ALL, CP_UTF_16LE);
   _setmode(_fileno(stdin), _O_U16TEXT);
   _setmode(_fileno(stdout), _O_U16TEXT);
#else
   setlocale(LC_ALL, &quot;&quot;);
#endif
}

int main(void) {

   set_locale_mode();

   wchar_t myString[SIZE];
   wchar_t loadedString[SIZE];

   wprintf(L&quot;Enter 3 characters: &quot;);
   wscanf(L&quot;%ls&quot;, myString);
   wprintf(L&quot;Your input is %ls\n&quot;, myString);

   FILE *pFile;

   if(pFile=fopen(&quot;data.txt&quot;, &quot;w&quot;)) {
      fwprintf(pFile, L&quot;%ls&quot;, myString);
   } else {
      wprintf(L&quot;Failed to write to file!\n&quot;);
   }

   fclose(pFile);

   if(pFile=fopen(&quot;data.txt&quot;, &quot;r&quot;)) {
      fwscanf(pFile, L&quot;%ls&quot;, loadedString);
   } else {
    wprintf(L&quot;Failed to read from file!\n&quot;);
   } 

   fclose(pFile);

   wprintf(L&quot;Loaded string is %ls&quot;, loadedString);

   return 0;
}

The following 2 examples work as intended:

First one:

Enter 3 characters: abc
Your input is abc
Loaded string is abc

Second one:

Enter 3 characters: &#229;&#228;&#246;
Your input is &#229;&#228;&#246;
Loaded string is &#229;&#228;&#246;

However, in the third attempt, I copied some chinese characters and copied them into the terminal and this was the outcome:

Enter 3 characters: 买买买
Your input is 买买买
Loaded string is 炨}ﺨa眬☺

The loaded string is obviously not what I intended. Why does it not work and how do I fix it?

Edit: I am on Windows and uses MINGW32 and uses VSCode.

答案1

得分: 1

在这种情况下,由于您打开了两个新流,您也需要将它们设置为正确的模式:

if (pFile = fopen("data.txt", "w")) {
#ifdef ON_WINDOWS
    _setmode(_fileno(pFile), _O_U16TEXT);      // &lt;- 这里
#endif
    fwprintf(pFile, L"%ls", myString);
} else {
    wprintf(L"无法写入文件!\n");
}

fclose(pFile);

if (pFile = fopen("data.txt", "r")) {
#ifdef ON_WINDOWS
    _setmode(_fileno(pFile), _O_U16TEXT);      // &lt;- 这里
#endif
    fwscanf(pFile, L"%99ls", loadedString);
} else {
    wprintf(L"无法从文件中读取!\n");
}

还请注意L"%99ls"中的99。如果您使用scanf系列函数从流中读取单词,请_始终_将要读取的字符的最大数量设置为缓冲区大小减1。


在Windows上,您还可以直接以正确的模式UTF-16LE打开文件:

#ifdef ON_WINDOWS
if (pFile = fopen("data.txt", "w, ccs=UTF-16LE")) { // &lt;- 这里
#else
if (pFile = fopen("data.txt", "w")) {
#endif
    fwprintf(pFile, L"%ls", myString);
} else {
    wprintf(L"无法写入文件!\n");
}

fclose(pFile);

#ifdef ON_WINDOWS
if (pFile = fopen("data.txt", "r, ccs=UTF-16LE")) { // &lt;- 这里
#else
if (pFile = fopen("data.txt", "r")) {
#endif
    fwscanf(pFile, L"%99ls", loadedString);
} else {
    wprintf(L"无法从文件中读取!\n");
}
英文:

In this case, since you open two new streams, you need to set them in the proper mode too:

if (pFile = fopen(&quot;data.txt&quot;, &quot;w&quot;)) {
#ifdef ON_WINDOWS
    _setmode(_fileno(pFile), _O_U16TEXT);      // &lt;- here
#endif
    fwprintf(pFile, L&quot;%ls&quot;, myString);
} else {
    wprintf(L&quot;Failed to write to file!\n&quot;);
}

fclose(pFile);

if (pFile = fopen(&quot;data.txt&quot;, &quot;r&quot;)) {
#ifdef ON_WINDOWS
    _setmode(_fileno(pFile), _O_U16TEXT);      // &lt;- here
#endif
    fwscanf(pFile, L&quot;%99ls&quot;, loadedString);
} else {
    wprintf(L&quot;Failed to read from file!\n&quot;);
}

Also note the 99 in the L&quot;%99ls&quot;. If you use the scanf family of functions to read words from streams, always set the max number of characters to read at most one less than the size of the buffer.


On Windows, you can also open the files with the correct mode, UTF-16LE, directly:

#ifdef ON_WINDOWS
if (pFile = fopen(&quot;data.txt&quot;, &quot;w, ccs=UTF-16LE&quot;)) { // &lt;- here
#else
if (pFile = fopen(&quot;data.txt&quot;, &quot;w&quot;)) {
#endif
    fwprintf(pFile, L&quot;%ls&quot;, myString);
} else {
    wprintf(L&quot;Failed to write to file!\n&quot;);
}

fclose(pFile);

#ifdef ON_WINDOWS
if (pFile = fopen(&quot;data.txt&quot;, &quot;r, ccs=UTF-16LE&quot;)) { // &lt;- here
#else
if (pFile = fopen(&quot;data.txt&quot;, &quot;r&quot;)) {
#endif
    fwscanf(pFile, L&quot;%99ls&quot;, loadedString);
} else {
    wprintf(L&quot;Failed to read from file!\n&quot;);
}

答案2

得分: 1

对于程序的目的,字符处理有三个重要的定位:

  • 终端
  • 程序
  • 文件

每一个都会有一个字符编码,用于将字符表示为数字,并且一组字符可以被识别。这些对于不同的定位可能并不相同,但如果它们相同会很方便。

如果要从终端捕获字符数据并最终将其回显到终端,则了解程序如何处理它对终端使用的编码是很重要的,而这是程序不能控制的。特别是,程序设置其区域设置不会影响终端以何种形式提供字符数据给它,但它会影响程序如何解释来自终端的数据。如果终端和您的程序在这方面有分歧,那么很可能会导致数据损坏。

另一方面,您的特定程序不需要关心终端使用的具体编码 - 它只需要存储接收到的数据,然后准确地将其回显。

如果终端未以宽字符模式运行,则尝试使用宽定向I/O函数从中读取或写入可能会使您陷入困境。如今,宽字符终端模式已经不太常见,因为基于UTF-8的模式,它们是以字节为导向的,几乎普遍可用,并且对于大多数目的来说更方便。如果您可以依赖终端以字节为导向的模式运行(特定模式并不重要),那么您可能已经超越了正常的做法。在这种情况下,您可能不需要做任何特别的事情 - 终端可以处理的Unicode字符应该可以正常工作:

#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
#define _CRT_SECURE_NO_WARNINGS
#endif

#include <stdio.h>
#define SIZE 100

int main(void) {
   char myString[SIZE];
   char loadedString[SIZE];

   printf("Enter 3 characters: ");
   scanf("%99s", myString);
   printf("Your input is %s\n", myString);

   FILE *pFile;

   if (pFile = fopen("data.txt", "w")) {
      fprintf(pFile, "%s", myString);
   } else {
      printf("Failed to open file for writing!\n");
   }
   fclose(pFile);

   if (pFile = fopen("data.txt", "r")) {
      fscanf(pFile, "%s", loadedString);
   } else {
      printf("Failed to open file for reading!\n");
   } 
   fclose(pFile);

   printf("Loaded string is %s", loadedString);

   return 0;
}

如果终端以宽字符模式运行,那么最主要的变化就是宽字符流可能包含许多单独的空字节,这些字节在普通C字符串中会被解释为字符串终止符。这是上述方法在与此类终端模式一起使用时不会被预期能正常工作的唯一原因。

另一方面,如果您的终端以宽字符模式运行,那么您最简单的前进方式可能就是全面切换到宽字符串和宽I/O,但没有理由改变区域设置或修改流模式。

如果您想处理宽字符和窄字符终端模式,那么您应该检查而不是设置区域设置,以尝试确定您正在处理哪种模式。然后,您将使用它来在窄样式和宽样式操作之间进行选择。

英文:

For the program's purposes, there are three important locuses for character handling:

  • the terminal
  • the program
  • the file

Each one will have a character encoding by which it represents characters as numbers, and a set of characters that they recognize. These are not necessarily the same for the various locuses, but it's convenient if they are.

If the point is to capture character data from the terminal and ultimately to echo it back to the terminal, then it's important to understand that how the program must handle that is sensitive to the encoding the terminal is using, and that that is outside the program's control. In particular, the program setting its locale does not influence the form in which the terminal will deliver character data to it, but it does influence how your program interprets the data from the terminal. If the terminal and your program disagree about that then data corruption is a likely result.

On the other hand, your particular program doesn't need to be concerned with what specific encoding the terminal is using -- it just needs to store the data it receives and then echo it back accurately.

If the terminal is not operating in a wide-character mode, then trying to read from it or write to it with wide-oriented I/O functions is likely to mess you up. And wide-character terminal modes are unusual these days, because UTF-8-based modes, which are byte-oriented, are pretty much universally available, and are more convenient to work with for most purposes. If you can rely on the terminal running in a byte-oriented mode (it doesn't matter which in particular) then you're going way out of your way. In that case, you probably don't need to do anything special -- Unicode characters your terminal handles at all should just work:

#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
#define _CRT_SECURE_NO_WARNINGS
#endif

#include &lt;stdio.h&gt;
#define SIZE 100

int main(void) {
   char myString[SIZE];
   char loadedString[SIZE];

   printf(&quot;Enter 3 characters: &quot;);
   scanf(&quot;%99s&quot;, myString);
   printf(&quot;Your input is %s\n&quot;, myString);

   FILE *pFile;

   if (pFile = fopen(&quot;data.txt&quot;, &quot;w&quot;)) {
      fprintf(pFile, &quot;%s&quot;, myString);
   } else {
      printf(&quot;Failed to open file for writing!\n&quot;);
   }
   fclose(pFile);

   if (pFile = fopen(&quot;data.txt&quot;, &quot;r&quot;)) {
      fscanf(pFile, &quot;%s&quot;, loadedString);
   } else {
      printf(&quot;Failed to open file for reading!\n&quot;);
   } 
   fclose(pFile);

   printf(&quot;Loaded string is %s&quot;, loadedString);

   return 0;
}

The main thing that changes if your terminal is operating in a wide-character mode is that the wide-character streams are likely contain a lot of individual nul bytes, which would be interpreted as string terminators in an ordinary C string. That's the only reason why the approach above would not be expected to work in conjunction with such a terminal mode.

If your terminal is running in a wide-character mode, on the other hand, then your easiest way forward is probably to just change to wide strings and wide I/O universally, but there should be no reason to change locale or modify stream modes.

If you want to handle both wide and narrow terminal modes then you would want to check, not set, the locale to try to figure out which you're dealing with. You would then use that to choose between narrow style and wide style operation.

huangapple
  • 本文由 发表于 2023年6月30日 02:23:27
  • 转载请务必保留本文链接:https://go.coder-hub.com/76583723.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定