英文:
Golang stdin reads german umlauts wrong
问题
我来自德国,所以我会用到像 ä、ö 和 ü 这样的变音字母(umlaut)。然而,Golang 无法从标准输入中正确读取它们。
当我执行这个简单的程序时:
package main
import (
"bufio"
"fmt"
"os"
)
// main reads lines from standard input and dumps the raw bytes of each one in
// hex, so the byte sequence delivered by the console can be inspected.
func main() {
	// Create the reader once: a fresh bufio.Reader per iteration would throw
	// away any input it had already buffered beyond the current line.
	stdin := bufio.NewReader(os.Stdin)
	for {
		// isPrefix is ignored on purpose: an over-long line is simply dumped
		// in several chunks.
		b, _, err := stdin.ReadLine()
		if err != nil {
			// End of input or a read error: ReadLine never returns data
			// together with an error, so stop instead of spinning forever.
			return
		}
		printBytes(b)
	}
}
// printBytes writes every byte of bytes to standard output as an upper-case
// hex literal followed by a space (for example "0xC3 0xA4 "), then ends the
// line.
func printBytes(bytes []byte) {
	for i := 0; i < len(bytes); i++ {
		fmt.Printf("0x%X ", bytes[i])
	}
	fmt.Print("\n")
}
我得到的输出是:
C:\dev\golang>go run test.go
ä
0xE2 0x80 0x9E
E2 80 9E 并不是 ä 在 UTF-8 中的正确字节序列(这个工具告诉我它是“DOUBLE LOW-9 QUOTATION MARK”,即 „),而当我直接打印读取到的内容时,输出的是 "。我写了一个小的“hack”,看起来可以正确地读取这些字符:
package main
/*
#include <stdio.h>
#include <stdlib.h>

// getline reads one line from stdin with fgetc and returns it as a
// heap-allocated, NUL-terminated string. The trailing '\n' is kept when one
// was read; reading stops early at EOF. The buffer starts at 100 bytes and
// doubles whenever it fills up. Returns NULL if allocation fails. The caller
// must release the result with freeline.
char * getline(void) {
    // line is the write cursor, linep the start of the buffer.
    char * line = malloc(100), * linep = line;
    // lenmax is the current capacity, len the space left before growing.
    size_t lenmax = 100, len = lenmax;
    int c;

    if(line == NULL)
        return NULL;

    for(;;) {
        c = fgetc(stdin);
        if(c == EOF)
            break;

        if(--len == 0) {
            len = lenmax;
            char * linen = realloc(linep, lenmax *= 2);

            if(linen == NULL) {
                free(linep);
                return NULL;
            }
            // Re-base the write cursor onto the (possibly moved) buffer.
            line = linen + (line - linep);
            linep = linen;
        }

        if((*line++ = c) == '\n')
            break;
    }
    *line = '\0';
    return linep;
}

// freeline releases a buffer previously returned by getline.
void freeline(char* ptr) {
    free(ptr);
}
*/
import "C"
import (
"fmt"
"golang.org/x/text/encoding/charmap"
)
// getLineFromCp850 reads one line from standard input through the C getline
// helper, decodes its bytes with the code page 850 decoder and returns the
// decoded text. The line terminator delivered by the helper is not stripped.
func getLineFromCp850() string {
	cLine := C.getline()
	// GoString copies the C buffer, so it is safe to release it on return.
	defer C.freeline(cLine)

	raw := []byte(C.GoString(cLine))
	// The decoder error is discarded, exactly as in the original snippet.
	decoded, _ := charmap.CodePage850.NewDecoder().Bytes(raw)
	return string(decoded)
}
// main reads lines from standard input via getLineFromCp850 and dumps the
// bytes of each decoded line in hex until the input is exhausted.
func main() {
	for {
		line := getLineFromCp850()
		if line == "" {
			// Even a blank input line still carries its trailing "\n", so an
			// empty result normally means end of input (or an allocation
			// failure in the C helper). Stop instead of looping forever.
			return
		}
		printBytes([]byte(line))
	}
}
// printBytes writes every byte of bytes to standard output as an upper-case
// hex literal followed by a space (for example "0xC3 0xA4 "), then ends the
// line.
func printBytes(bytes []byte) {
	for i := 0; i < len(bytes); i++ {
		fmt.Printf("0x%X ", bytes[i])
	}
	fmt.Print("\n")
}
它打印出:
C:\dev\golang>go run test.go
ä
0xC3 0xA4 0xA
C3 A4 是 ä 的正确字节序列(0A 是换行符,我的 hack 没有把它去掉)。所以看起来,按 CP850 读取再转换为 UTF-8 确实能解决问题,正如我所预期的那样。但为什么用 Go 自身的功能(而不是 cgo)读取这一行时,得到的却是乱码呢?Go 出了什么问题才会给出这些值——它是不是没有把输入字节当作 CP850,而是当作了别的字符集来解释?有没有更好的、只用 Go 就能处理这个问题的方法?
这个问题只在从标准输入读取时出现。当我把 UTF-8 的 ä 打印到标准输出时,它在控制台上显示正确。
英文:
I'm from Germany, so I use umlauts like ä, ö and ü. Golang, however, doesn't read them correctly from stdin.
When I execute this simple program:
package main
import (
"bufio"
"fmt"
"os"
)
// main reads lines from standard input and dumps the raw bytes of each one in
// hex, so the byte sequence delivered by the console can be inspected.
func main() {
	// Create the reader once: a fresh bufio.Reader per iteration would throw
	// away any input it had already buffered beyond the current line.
	stdin := bufio.NewReader(os.Stdin)
	for {
		// isPrefix is ignored on purpose: an over-long line is simply dumped
		// in several chunks.
		b, _, err := stdin.ReadLine()
		if err != nil {
			// End of input or a read error: ReadLine never returns data
			// together with an error, so stop instead of spinning forever.
			return
		}
		printBytes(b)
	}
}
// printBytes writes every byte of bytes to standard output as an upper-case
// hex literal followed by a space (for example "0xC3 0xA4 "), then ends the
// line.
func printBytes(bytes []byte) {
	for i := 0; i < len(bytes); i++ {
		fmt.Printf("0x%X ", bytes[i])
	}
	fmt.Print("\n")
}
I get the output:
C:\dev\golang>go run test.go
ä
0xE2 0x80 0x9E
E2 80 9E isn't the correct byte sequence for ä in UTF-8 (this tool tells me it's a "DOUBLE LOW-9 QUOTATION MARK" -> „), and when I just print out what I've read, it prints ". I've written a small "hack" which seems to read the characters correctly:
package main
/*
#include <stdio.h>
#include <stdlib.h>

// getline reads one line from stdin with fgetc and returns it as a
// heap-allocated, NUL-terminated string. The trailing '\n' is kept when one
// was read; reading stops early at EOF. The buffer starts at 100 bytes and
// doubles whenever it fills up. Returns NULL if allocation fails. The caller
// must release the result with freeline.
char * getline(void) {
    // line is the write cursor, linep the start of the buffer.
    char * line = malloc(100), * linep = line;
    // lenmax is the current capacity, len the space left before growing.
    size_t lenmax = 100, len = lenmax;
    int c;

    if(line == NULL)
        return NULL;

    for(;;) {
        c = fgetc(stdin);
        if(c == EOF)
            break;

        if(--len == 0) {
            len = lenmax;
            char * linen = realloc(linep, lenmax *= 2);

            if(linen == NULL) {
                free(linep);
                return NULL;
            }
            // Re-base the write cursor onto the (possibly moved) buffer.
            line = linen + (line - linep);
            linep = linen;
        }

        if((*line++ = c) == '\n')
            break;
    }
    *line = '\0';
    return linep;
}

// freeline releases a buffer previously returned by getline.
void freeline(char* ptr) {
    free(ptr);
}
*/
import "C"
import (
"fmt"
"golang.org/x/text/encoding/charmap"
)
// getLineFromCp850 reads one line from standard input through the C getline
// helper, decodes its bytes with the code page 850 decoder and returns the
// decoded text. The line terminator delivered by the helper is not stripped.
func getLineFromCp850() string {
	cLine := C.getline()
	// GoString copies the C buffer, so it is safe to release it on return.
	defer C.freeline(cLine)

	raw := []byte(C.GoString(cLine))
	// The decoder error is discarded, exactly as in the original snippet.
	decoded, _ := charmap.CodePage850.NewDecoder().Bytes(raw)
	return string(decoded)
}
// main reads lines from standard input via getLineFromCp850 and dumps the
// bytes of each decoded line in hex until the input is exhausted.
func main() {
	for {
		line := getLineFromCp850()
		if line == "" {
			// Even a blank input line still carries its trailing "\n", so an
			// empty result normally means end of input (or an allocation
			// failure in the C helper). Stop instead of looping forever.
			return
		}
		printBytes([]byte(line))
	}
}
// printBytes writes every byte of bytes to standard output as an upper-case
// hex literal followed by a space (for example "0xC3 0xA4 "), then ends the
// line.
func printBytes(bytes []byte) {
	for i := 0; i < len(bytes); i++ {
		fmt.Printf("0x%X ", bytes[i])
	}
	fmt.Print("\n")
}
And it prints out:
C:\dev\golang>go run test.go
ä
0xC3 0xA4 0xA
C3 A4 is the correct byte sequence for ä (0A is the linefeed, which my hack doesn't strip). So it seems that reading as CP850 and converting to UTF-8 does the job, as I expected. But why does Go give me gibberish when I read the line using Go's functionality instead of cgo? What's wrong with Go that it gives me those values? Does it interpret the input bytes not as CP850 but as another charset? Is there a better Go-only way to handle this problem?
This problem only arises when reading from stdin. When I print a UTF-8 ä to stdout, it prints correctly in the console.
答案1
得分: 2
所以这是 Golang 在某些系统上的一个错误,具体来说是在系统整体使用的字符集与控制台字符集不同的 Windows 系统上(即 WinAPI 的 GetACP() 和 GetConsoleCP() 返回不同的结果)。例如,在德国(可能还有其他西欧国家),Windows 使用代码页 1252 作为整体字符集,但在控制台 cmd.exe 中使用代码页 850。不确定为什么,但事实就是如此。Golang 错误地使用 GetACP() 返回的代码页将输入解码为 UTF-8,而实际上应该使用 GetConsoleCP() 返回的代码页。我们是在我创建的 Issue 中找到这个问题的,希望修复能够合并进下一个版本的 Golang。
我们还在Windows上发现了一个问题,即Golang将字符解码为“分解的UTF-8”字符(即它会将ä
读取为字符a
后跟着COMBINING DIAERESIS ̈
),这可能会导致其他问题,例如打印这些分解的字符时会将它们分开而不是组合成一个字符。
英文:
So it was a bug in Golang on some systems, specifically Windows systems where the overall charset and the console charset differ (where GetACP() and GetConsoleCP() from WinAPI return different values). In Germany, for example (and maybe in other West European countries), Windows uses codepage 1252 as the overall charset, but it uses codepage 850 for the console cmd.exe. Not sure why, but that's how it is. Golang wrongly used GetACP() to decode the input to UTF-8 when it really should've used the codepage returned by GetConsoleCP(). We found the problem in the issue I created, and we'll hopefully see the fix merged for the next version of Golang.
We also found a problem on Windows where Golang decoded characters to decomposed UTF-8 characters (i.e. it would read an ä as the character a followed by the COMBINING DIAERESIS ̈), which could lead to other problems; for example, printing those decomposed characters prints them separately instead of as one combined character.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论