如何在golang中将utf16文本文件读取为字符串?

huangapple go评论91阅读模式
英文:

How to read utf16 text file to string in golang?

问题

我可以将文件读取为字节数组

但是当我将其转换为字符串时

它将utf16字节视为ascii

如何正确转换它?

package main

import (
	"fmt"
	"os"
	"bufio"
)

// main opens test.txt, reads its first line with bufio.Reader.ReadLine and
// prints three things: the isPrefix flag, the raw bytes, and the bytes
// converted with string(). The string conversion performs no decoding, so
// UTF-16 input is printed byte by byte.
func main() {
	file, openErr := os.Open("test.txt")
	if openErr != nil {
		fmt.Printf("打开文件时出错:%v\n", openErr)
		os.Exit(1)
	}

	line, isPrefix, readErr := bufio.NewReader(file).ReadLine()
	if readErr != nil {
		// A failed read produces no output at all.
		return
	}
	fmt.Println(isPrefix)
	fmt.Println(line)
	fmt.Println(string(line))
}
英文:

I can read the file to bytes array

but when I convert it to string

it treats the UTF-16 bytes as ASCII

How to convert it correctly?

package main

import ("fmt"
"os"
"bufio"
)

// main opens test.txt, reads its first line with bufio.Reader.ReadLine and
// prints three things: the isPrefix flag, the raw bytes, and the bytes
// converted with string(). The string conversion performs no decoding, so
// UTF-16 input is printed byte by byte.
func main() {
	file, openErr := os.Open("test.txt")
	if openErr != nil {
		fmt.Printf("error opening file: %v\n", openErr)
		os.Exit(1)
	}

	line, isPrefix, readErr := bufio.NewReader(file).ReadLine()
	if readErr != nil {
		// A failed read produces no output at all.
		return
	}
	fmt.Println(isPrefix)
	fmt.Println(line)
	fmt.Println(string(line))
}

output:

false

[255 254 91 0 83 0 99 0 114 0 105 0 112 0 116 0 32 0 73 0 110 0 102 0 111 0 93 0
13 0]

S c r i p t I n f o ]


Update:

After testing the two examples, I now understand what the exact problem is.

On Windows, if the line ends with a line break (CR+LF), the CR is read as part of the line, because the ReadLine function cannot handle UTF-16 correctly ([0D 0A] = ok, [0D 00 0A 00] = not ok).

If the ReadLine function could recognize UTF-16, it would understand [0D 00 0A 00] and return []uint16 rather than []byte.

So I think I should not use bufio.NewReader, as it is not able to read UTF-16; I don't see that bufio.Reader.ReadLine accepts a parameter or flag indicating whether the text being read is UTF-8, UTF-16LE/BE or UTF-32. Is there any ReadLine function for Unicode text in the Go library?

答案1

得分: 20

最新版本的golang.org/x/text/encoding/unicode使得这个过程更加容易,因为它包含了unicode.BOMOverride,它会智能地解释BOM。

这是ReadFileUTF16()函数,类似于os.ReadFile()但解码UTF-16。

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"log"
	"strings"

	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

// ReadFileUTF16 reads the named file, like ioutil.ReadFile, and returns its
// contents converted to UTF-8. A byte order mark at the start of the data
// selects the decoding; when there is none the data is decoded as
// big-endian UTF-16.
func ReadFileUTF16(filename string) ([]byte, error) {
	contents, err := ioutil.ReadFile(filename)
	if err != nil {
		return nil, err
	}

	// Decoder used when the data carries no BOM.
	fallback := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()
	// BOMOverride behaves like fallback but switches decoding when it sees a BOM.
	decoder := unicode.BOMOverride(fallback)

	// Stream the raw bytes through the decoder and collect the UTF-8 result.
	return ioutil.ReadAll(transform.NewReader(bytes.NewReader(contents), decoder))
}

// main decodes inputfile.txt with ReadFileUTF16, converts CRLF line endings
// to LF and prints the result.
func main() {
	utf8Bytes, err := ReadFileUTF16("inputfile.txt")
	if err != nil {
		log.Fatal(err)
	}

	text := string(utf8Bytes)
	fmt.Println(strings.Replace(text, "\r\n", "\n", -1))
}

这是NewScannerUTF16函数,类似于os.Open()但返回一个scanner。

package main

import (
	"bufio"
	"fmt"
	"log"
	"os"

	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

// utfScanner is the type returned by NewScannerUTF16. Despite its name it
// only requires a Read method with the same signature as io.Reader's.
type utfScanner interface {
	Read(buf []byte) (int, error)
}

// NewScannerUTF16 opens the named file and returns a reader that yields its
// contents converted to UTF-8. A byte order mark at the start of the file
// selects the decoding; when there is none the data is decoded as
// big-endian UTF-16.
//
// Despite the name, the result is a reader, not a bufio.Scanner; wrap it with
// bufio.NewScanner to read lines. The opened file is not closed here and is
// not reachable through the returned value.
func NewScannerUTF16(filename string) (utfScanner, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}

	// Decoder used when the file carries no BOM.
	fallback := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()

	// BOMOverride behaves like fallback but switches decoding when it sees a BOM.
	return transform.NewReader(f, unicode.BOMOverride(fallback)), nil
}

// main prints inputfile.txt line by line after decoding it to UTF-8.
func main() {
	src, err := NewScannerUTF16("inputfile.txt")
	if err != nil {
		log.Fatal(err)
	}

	lines := bufio.NewScanner(src)
	for lines.Scan() {
		// Scan strips the line terminator; Println puts a '\n' back.
		fmt.Println(lines.Text())
	}
	if scanErr := lines.Err(); scanErr != nil {
		fmt.Fprintln(os.Stderr, "reading inputfile:", scanErr)
	}
}

FYI:我已经将这些函数放入了一个开源模块中,并进行了进一步的改进。请参阅https://github.com/TomOnTime/utfutil/。

英文:

The latest version of golang.org/x/text/encoding/unicode makes it easier to do this because it includes unicode.BOMOverride, which will intelligently interpret the BOM.

Here is ReadFileUTF16(), which is like os.ReadFile() but decodes UTF-16.

<!-- language: lang-go -->

package main

import (
	&quot;bytes&quot;
	&quot;fmt&quot;
	&quot;io/ioutil&quot;
	&quot;log&quot;
	&quot;strings&quot;

	&quot;golang.org/x/text/encoding/unicode&quot;
	&quot;golang.org/x/text/transform&quot;
)

// ReadFileUTF16 reads the named file, like ioutil.ReadFile, and returns its
// contents converted to UTF-8. A byte order mark at the start of the data
// selects the decoding; when there is none the data is decoded as
// big-endian UTF-16.
func ReadFileUTF16(filename string) ([]byte, error) {
	contents, err := ioutil.ReadFile(filename)
	if err != nil {
		return nil, err
	}

	// Decoder used when the data carries no BOM.
	fallback := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()
	// BOMOverride behaves like fallback but switches decoding when it sees a BOM.
	decoder := unicode.BOMOverride(fallback)

	// Stream the raw bytes through the decoder and collect the UTF-8 result.
	return ioutil.ReadAll(transform.NewReader(bytes.NewReader(contents), decoder))
}

func main() {
	data, err := ReadFileUTF16(&quot;inputfile.txt&quot;)
	if err != nil {
		log.Fatal(err)
	}
	final := strings.Replace(string(data), &quot;\r\n&quot;, &quot;\n&quot;, -1)
	fmt.Println(final)

}

Here is NewScannerUTF16 which is like os.Open() but returns a scanner.

<!-- language: lang-go -->

package main

import (
	&quot;bufio&quot;
	&quot;fmt&quot;
	&quot;log&quot;
	&quot;os&quot;

	&quot;golang.org/x/text/encoding/unicode&quot;
	&quot;golang.org/x/text/transform&quot;
)

// utfScanner is the type returned by NewScannerUTF16. Despite its name it
// only requires a Read method with the same signature as io.Reader's.
type utfScanner interface {
	Read(buf []byte) (int, error)
}

// NewScannerUTF16 opens the named file and returns a reader that yields its
// contents converted to UTF-8. A byte order mark at the start of the file
// selects the decoding; when there is none the data is decoded as
// big-endian UTF-16.
//
// Despite the name, the result is a reader, not a bufio.Scanner; wrap it with
// bufio.NewScanner to read lines. The opened file is not closed here and is
// not reachable through the returned value.
func NewScannerUTF16(filename string) (utfScanner, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}

	// Decoder used when the file carries no BOM.
	fallback := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()

	// BOMOverride behaves like fallback but switches decoding when it sees a BOM.
	return transform.NewReader(f, unicode.BOMOverride(fallback)), nil
}

func main() {

	s, err := NewScannerUTF16(&quot;inputfile.txt&quot;)
	if err != nil {
		log.Fatal(err)
	}

	scanner := bufio.NewScanner(s)
	for scanner.Scan() {
		fmt.Println(scanner.Text()) // Println will add back the final &#39;\n&#39;
	}
	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, &quot;reading inputfile:&quot;, err)
	}

}

FYI: I have put these functions into an open source module and have made further improvements. See https://github.com/TomOnTime/utfutil/

答案2

得分: 14

UTF16、UTF8和字节顺序标记由Unicode Consortium定义:UTF-16 FAQ、UTF-8 FAQ和字节顺序标记(BOM)FAQ。


> 问题4802:bufio:读取行太麻烦
>
> 在Go中从文件中读取行太麻烦了。
>
> 人们通常会被bufio.Reader.ReadLine的名称所吸引,
> 但它有一个奇怪的签名,返回(line []byte,isPrefix bool,
> err error),并且需要很多工作。
>
> ReadSlice和ReadString需要一个分隔符字节,几乎
> 总是明显且不美观的'\n',并且还可以返回一行
> 和一个EOF


> 修订:f685026a2d38
>
> bufio:新的Scanner接口
>
> 基于一个名为Scanner的新类型,添加了一个新的简单接口来扫描(可能是文本)数据。
> 它具有自己的内部缓冲区,因此即使没有注入bufio.Reader,也应该是有效的。
> 输入的格式由“split函数”定义,默认情况下分割为行。


> go1.1beta1发布
>
> 您可以从常规位置下载二进制和源代码分发:
> https://code.google.com/p/go/downloads/list?q=go1.1beta1


这是一个使用Unicode规则将UTF16文本文件行转换为Go UTF8编码字符串的程序。该代码已经修订以利用Go 1.1中的新的bufio.Scanner接口。

package main

import (
	&quot;bufio&quot;
	&quot;bytes&quot;
	&quot;encoding/binary&quot;
	&quot;fmt&quot;
	&quot;os&quot;
	&quot;runtime&quot;
	&quot;unicode/utf16&quot;
	&quot;unicode/utf8&quot;
)

// UTF16BytesToString converts UTF-16 encoded bytes, read in byte order o, to a
// UTF-8 encoded string.
//
// Bytes are consumed two at a time. A trailing odd byte cannot form a code
// unit and is replaced by U+FFFD. o must be non-nil when b holds at least
// two bytes.
func UTF16BytesToString(b []byte, o binary.ByteOrder) string {
	units := make([]uint16, 0, (len(b)+1)/2)
	for ; len(b) >= 2; b = b[2:] {
		units = append(units, o.Uint16(b))
	}
	if len(b) == 1 {
		// Dangling half of a code unit.
		units = append(units, utf8.RuneError)
	}
	return string(utf16.Decode(units))
}

// UTF-16 byte orders tracked while scanning.
const (
	unknownEndian = iota
	bigEndian
	littleEndian
)

// dropCREndian returns data without its last two bytes when those bytes are
// exactly t1 followed by t2; otherwise it returns data unchanged.
func dropCREndian(data []byte, t1, t2 byte) []byte {
	if n := len(data); n > 1 && data[n-2] == t1 && data[n-1] == t2 {
		return data[:n-2]
	}
	return data
}

// dropCRBE drops a terminal \r (bytes 0x00 0x0D) from big-endian data.
func dropCRBE(data []byte) []byte {
	return dropCREndian(data, '\x00', '\r')
}

// dropCRLE drops a terminal \r (bytes 0x0D 0x00) from little-endian data.
func dropCRLE(data []byte) []byte {
	return dropCREndian(data, '\r', '\x00')
}

// dropCR drops a terminal \r from data whose byte order is not yet known.
// It returns the trimmed data together with the byte order implied by the
// \r that was found, or data unchanged and unknownEndian when there is no
// terminal \r. The little-endian form is tried first.
func dropCR(data []byte) ([]byte, int) {
	if trimmed := dropCRLE(data); len(trimmed) != len(data) {
		return trimmed, littleEndian
	}
	if trimmed := dropCRBE(data); len(trimmed) != len(data) {
		return trimmed, bigEndian
	}
	return data, unknownEndian
}

// SplitFunc是Scanner的拆分函数,它返回每行文本,不带任何尾随的行结束标记。
// 返回的行可能为空。行结束标记是一个可选的回车符,后面跟一个必需的换行符。在正则表达式表示中,它是`\r?\n`。
// 即使没有换行符,也将返回输入的最后一个非空行。
func ScanUTF16LinesFunc(byteOrder binary.ByteOrder) (bufio.SplitFunc, func() binary.ByteOrder) {

	// 函数闭包变量
	var endian = unknownEndian
	switch byteOrder {
	case binary.BigEndian:
		endian = bigEndian
	case binary.LittleEndian:
		endian = littleEndian
	}
	const bom = 0xFEFF
	var checkBOM bool = endian == unknownEndian

	// Scanner拆分函数
	splitFunc := func(data []byte, atEOF bool) (advance int, token []byte, err error) {

		if atEOF &amp;&amp; len(data) == 0 {
			return 0, nil, nil
		}

		if checkBOM {
			checkBOM = false
			if len(data) &gt; 1 {
				switch uint16(bom) {
				case uint16(data[0])&lt;&lt;8 | uint16(data[1]):
					endian = bigEndian
					return 2, nil, nil
				case uint16(data[1])&lt;&lt;8 | uint16(data[0]):
					endian = littleEndian
					return 2, nil, nil
				}
			}
		}

		// 扫描以换行符结尾的行。
		i := 0
		for {
			j := bytes.IndexByte(data[i:], &#39;\n&#39;)
			if j &lt; 0 {
				break
			}
			i += j
			switch e := i % 2; e {
			case 1: // UTF-16BE
				if endian != littleEndian {
					if i &gt; 1 {
						if data[i-1] == &#39;\x00&#39; {
							endian = bigEndian
							// 我们有一个完整的以换行符结尾的行。
							return i + 1, dropCRBE(data[0 : i-1]), nil
						}
					}
				}
			case 0: // UTF-16LE
				if endian != bigEndian {
					if i+1 &lt; len(data) {
						i++
						if data[i] == &#39;\x00&#39; {
							endian = littleEndian
							// 我们有一个完整的以换行符结尾的行。
							return i + 1, dropCRLE(data[0 : i-1]), nil
						}
					}
				}
			}
			i++
		}

		// 如果我们在EOF处,我们有一行最后的非终止行。返回它。
		if atEOF {
			// 删除CR。
			advance = len(data)
			switch endian {
			case bigEndian:
				data = dropCRBE(data)
			case littleEndian:
				data = dropCRLE(data)
			default:
				data, endian = dropCR(data)
			}
			if endian == unknownEndian {
				if runtime.GOOS == &quot;windows&quot; {
					endian = littleEndian
				} else {
					endian = bigEndian
				}
			}
			return advance, data, nil
		}

		// 请求更多数据。
		return 0, nil, nil
	}

	// 字节顺序函数
	orderFunc := func() (byteOrder binary.ByteOrder) {
		switch endian {
		case bigEndian:
			byteOrder = binary.BigEndian
		case littleEndian:
			byteOrder = binary.LittleEndian
		}
		return byteOrder
	}

	return splitFunc, orderFunc
}

func main() {
	file, err := os.Open(&quot;utf16.le.txt&quot;)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	defer file.Close()
	fmt.Println(file.Name())

	rdr := bufio.NewReader(file)
	scanner := bufio.NewScanner(rdr)
	var bo binary.ByteOrder // unknown, infer from data
	// bo = binary.LittleEndian // windows
	splitFunc, orderFunc := ScanUTF16LinesFunc(bo)
	scanner.Split(splitFunc)

	for scanner.Scan() {
		b := scanner.Bytes()
		s := UTF16BytesToString(b, orderFunc())
		fmt.Println(len(s), s)
		fmt.Println(len(b), b)
	}
	fmt.Println(orderFunc())

	if err := scanner.Err(); err != nil {
		fmt.Println(err)
	}
}

输出:

utf16.le.txt
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
0 
0 []
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
LittleEndian

utf16.be.txt
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
0 
0 []
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
BigEndian
英文:

UTF16, UTF8, and Byte Order Marks are defined by the Unicode Consortium: UTF-16 FAQ, UTF-8 FAQ, and Byte Order Mark (BOM) FAQ.


> Issue 4802: bufio: reading lines is too cumbersome
>
> Reading lines from a file is too cumbersome in Go.
>
> People are often drawn to bufio.Reader.ReadLine because of its name,
> but it has a weird signature, returning (line []byte, isPrefix bool,
> err error), and requires a lot of work.
>
> ReadSlice and ReadString require a delimiter byte, which is almost
> always the obvious and unsightly '\n', and also can return both a line
> and an EOF


> Revision: f685026a2d38
>
> bufio: new Scanner interface
>
> Add a new, simple interface for scanning (probably textual) data,
> based on a new type called Scanner. It does its own internal
> buffering, so should be plausibly efficient even without injecting a
> bufio.Reader. The format of the input is defined by a "split
> function", by default splitting into lines.


> go1.1beta1 released
>
> You can download binary and source distributions from the usual place:
> https://code.google.com/p/go/downloads/list?q=go1.1beta1


Here's a program which uses the Unicode rules to convert UTF16 text file lines to Go UTF8 encoded strings. The code has been revised to take advantage of the new bufio.Scanner interface in Go 1.1.

package main

import (
	&quot;bufio&quot;
	&quot;bytes&quot;
	&quot;encoding/binary&quot;
	&quot;fmt&quot;
	&quot;os&quot;
	&quot;runtime&quot;
	&quot;unicode/utf16&quot;
	&quot;unicode/utf8&quot;
)

// UTF16BytesToString converts UTF-16 encoded bytes, read in byte order o, to a
// UTF-8 encoded string.
//
// Bytes are consumed two at a time. A trailing odd byte cannot form a code
// unit and is replaced by U+FFFD. o must be non-nil when b holds at least
// two bytes.
func UTF16BytesToString(b []byte, o binary.ByteOrder) string {
	units := make([]uint16, 0, (len(b)+1)/2)
	for ; len(b) >= 2; b = b[2:] {
		units = append(units, o.Uint16(b))
	}
	if len(b) == 1 {
		// Dangling half of a code unit.
		units = append(units, utf8.RuneError)
	}
	return string(utf16.Decode(units))
}

// UTF-16 byte orders tracked while scanning.
const (
	unknownEndian = iota
	bigEndian
	littleEndian
)

// dropCREndian returns data without its last two bytes when those bytes are
// exactly t1 followed by t2; otherwise it returns data unchanged.
func dropCREndian(data []byte, t1, t2 byte) []byte {
	if n := len(data); n > 1 && data[n-2] == t1 && data[n-1] == t2 {
		return data[:n-2]
	}
	return data
}

// dropCRBE drops a terminal \r (bytes 0x00 0x0D) from big-endian data.
func dropCRBE(data []byte) []byte {
	return dropCREndian(data, '\x00', '\r')
}

// dropCRLE drops a terminal \r (bytes 0x0D 0x00) from little-endian data.
func dropCRLE(data []byte) []byte {
	return dropCREndian(data, '\r', '\x00')
}

// dropCR drops a terminal \r from data whose byte order is not yet known.
// It returns the trimmed data together with the byte order implied by the
// \r that was found, or data unchanged and unknownEndian when there is no
// terminal \r. The little-endian form is tried first.
func dropCR(data []byte) ([]byte, int) {
	if trimmed := dropCRLE(data); len(trimmed) != len(data) {
		return trimmed, littleEndian
	}
	if trimmed := dropCRBE(data); len(trimmed) != len(data) {
		return trimmed, bigEndian
	}
	return data, unknownEndian
}

// SplitFunc is a split function for a Scanner that returns each line of
// text, stripped of any trailing end-of-line marker. The returned line may
// be empty. The end-of-line marker is one optional carriage return followed
// by one mandatory newline. In regular expression notation, it is `\r?\n`.
// The last non-empty line of input will be returned even if it has no
// newline.
func ScanUTF16LinesFunc(byteOrder binary.ByteOrder) (bufio.SplitFunc, func() binary.ByteOrder) {

	// Function closure variables
	var endian = unknownEndian
	switch byteOrder {
	case binary.BigEndian:
		endian = bigEndian
	case binary.LittleEndian:
		endian = littleEndian
	}
	const bom = 0xFEFF
	var checkBOM bool = endian == unknownEndian

	// Scanner split function
	splitFunc := func(data []byte, atEOF bool) (advance int, token []byte, err error) {

		if atEOF &amp;&amp; len(data) == 0 {
			return 0, nil, nil
		}

		if checkBOM {
			checkBOM = false
			if len(data) &gt; 1 {
				switch uint16(bom) {
				case uint16(data[0])&lt;&lt;8 | uint16(data[1]):
					endian = bigEndian
					return 2, nil, nil
				case uint16(data[1])&lt;&lt;8 | uint16(data[0]):
					endian = littleEndian
					return 2, nil, nil
				}
			}
		}

		// Scan for newline-terminated lines.
		i := 0
		for {
			j := bytes.IndexByte(data[i:], &#39;\n&#39;)
			if j &lt; 0 {
				break
			}
			i += j
			switch e := i % 2; e {
			case 1: // UTF-16BE
				if endian != littleEndian {
					if i &gt; 1 {
						if data[i-1] == &#39;\x00&#39; {
							endian = bigEndian
							// We have a full newline-terminated line.
							return i + 1, dropCRBE(data[0 : i-1]), nil
						}
					}
				}
			case 0: // UTF-16LE
				if endian != bigEndian {
					if i+1 &lt; len(data) {
						i++
						if data[i] == &#39;\x00&#39; {
							endian = littleEndian
							// We have a full newline-terminated line.
							return i + 1, dropCRLE(data[0 : i-1]), nil
						}
					}
				}
			}
			i++
		}

		// If we&#39;re at EOF, we have a final, non-terminated line. Return it.
		if atEOF {
			// drop CR.
			advance = len(data)
			switch endian {
			case bigEndian:
				data = dropCRBE(data)
			case littleEndian:
				data = dropCRLE(data)
			default:
				data, endian = dropCR(data)
			}
			if endian == unknownEndian {
				if runtime.GOOS == &quot;windows&quot; {
					endian = littleEndian
				} else {
					endian = bigEndian
				}
			}
			return advance, data, nil
		}

		// Request more data.
		return 0, nil, nil
	}

	// Endian byte order function
	orderFunc := func() (byteOrder binary.ByteOrder) {
		switch endian {
		case bigEndian:
			byteOrder = binary.BigEndian
		case littleEndian:
			byteOrder = binary.LittleEndian
		}
		return byteOrder
	}

	return splitFunc, orderFunc
}

func main() {
	file, err := os.Open(&quot;utf16.le.txt&quot;)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	defer file.Close()
	fmt.Println(file.Name())

	rdr := bufio.NewReader(file)
	scanner := bufio.NewScanner(rdr)
	var bo binary.ByteOrder // unknown, infer from data
	// bo = binary.LittleEndian // windows
	splitFunc, orderFunc := ScanUTF16LinesFunc(bo)
	scanner.Split(splitFunc)

	for scanner.Scan() {
		b := scanner.Bytes()
		s := UTF16BytesToString(b, orderFunc())
		fmt.Println(len(s), s)
		fmt.Println(len(b), b)
	}
	fmt.Println(orderFunc())

	if err := scanner.Err(); err != nil {
		fmt.Println(err)
	}
}

Output:

utf16.le.txt
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
0 
0 []
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
LittleEndian

utf16.be.txt
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
0 
0 []
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
BigEndian

答案3

得分: 11

这是最简单的读取方法:

package main

import (
	"bufio"
	"fmt"
	"log"
	"os"

	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

func main() {
	file, err := os.Open("./text.txt")
	if err != nil {
		log.Fatal(err)
	}

	scanner := bufio.NewScanner(transform.NewReader(file, unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewDecoder()))
	for scanner.Scan() {
		fmt.Printf(scanner.Text())
	}
}

由于Windows默认使用小端序,我们使用unicode.UseBOM策略从文本中获取BOM,并使用unicode.LittleEndian作为备选方案。

英文:

Here is the simplest way to read it:

package main

import (
	&quot;bufio&quot;
	&quot;fmt&quot;
	&quot;log&quot;
	&quot;os&quot;

	&quot;golang.org/x/text/encoding/unicode&quot;
	&quot;golang.org/x/text/transform&quot;
)

func main() {
	file, err := os.Open(&quot;./text.txt&quot;)
	if err != nil {
		log.Fatal(err)
	}

	scanner := bufio.NewScanner(transform.NewReader(file, unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewDecoder()))
	for scanner.Scan() {
		fmt.Printf(scanner.Text())
	}
}

Since Windows uses little-endian byte order by default, we use the unicode.UseBOM policy to pick up the BOM from the text, with unicode.LittleEndian as the fallback.

答案4

得分: 4

package main

import (
"errors"
"fmt"
"log"
"unicode/utf16"
)

// utf16toString decodes UTF-16 encoded bytes into a UTF-8 string.
//
// A leading byte order mark (FF FE for little endian, FE FF for big endian)
// selects the byte order and is removed from the result; without one the
// input is read as big endian. An error is returned when len(b) is odd.
func utf16toString(b []uint8) (string, error) {
	if len(b)%2 == 1 {
		return "", errors.New("len(b) must be even")
	}

	// Offsets of the high and low byte inside each 2-byte code unit.
	hi, lo := 0, 1
	if len(b) >= 2 {
		switch {
		case b[0] == 0xff && b[1] == 0xfe:
			hi, lo = 1, 0
			b = b[2:]
		case b[0] == 0xfe && b[1] == 0xff:
			b = b[2:]
		}
	}

	units := make([]uint16, 0, len(b)/2)
	for off := 0; off < len(b); off += 2 {
		units = append(units, uint16(b[off+hi])<<8|uint16(b[off+lo]))
	}
	return string(utf16.Decode(units)), nil
}

// main decodes a fixed UTF-16 sample (little-endian BOM followed by
// "[Script Info]\r") and prints the result as a quoted string.
func main() {
	// Stand-in for bytes read from a file.
	sample := []byte{255, 254, 91, 0, 83, 0, 99, 0, 114, 0, 105, 0, 112, 0, 116, 0, 32, 0, 73, 0, 110, 0, 102, 0, 111, 0, 93, 0, 13, 0}

	decoded, err := utf16toString(sample)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%q", decoded)
}

英文:

For example:

package main

import (
        &quot;errors&quot;
        &quot;fmt&quot;
        &quot;log&quot;
        &quot;unicode/utf16&quot;
)

// utf16toString decodes UTF-16 encoded bytes into a UTF-8 string.
//
// A leading byte order mark (FF FE for little endian, FE FF for big endian)
// selects the byte order and is removed from the result; without one the
// input is read as big endian. An error is returned when len(b) is odd.
func utf16toString(b []uint8) (string, error) {
	if len(b)%2 == 1 {
		return "", errors.New("len(b) must be even")
	}

	// Offsets of the high and low byte inside each 2-byte code unit.
	hi, lo := 0, 1
	if len(b) >= 2 {
		switch {
		case b[0] == 0xff && b[1] == 0xfe:
			hi, lo = 1, 0
			b = b[2:]
		case b[0] == 0xfe && b[1] == 0xff:
			b = b[2:]
		}
	}

	units := make([]uint16, 0, len(b)/2)
	for off := 0; off < len(b); off += 2 {
		units = append(units, uint16(b[off+hi])<<8|uint16(b[off+lo]))
	}
	return string(utf16.Decode(units)), nil
}

func main() {
        // Simulated data from e.g. a file
        b := []byte{255, 254, 91, 0, 83, 0, 99, 0, 114, 0, 105, 0, 112, 0, 116, 0, 32, 0, 73, 0, 110, 0, 102, 0, 111, 0, 93, 0, 13, 0}
        s, err := utf16toString(b)
        if err != nil {
                log.Fatal(err)
        }

        fmt.Printf(&quot;%q&quot;, s)
}

(Also here)

Output:


"[Script Info]\r"

答案5

得分: 0

如果你想将任何内容打印为字符串,可以使用fmt.Sprint

package main

import (
	"bufio"
	"fmt"
	"os"
)

// main opens test.txt, reads its first line with bufio.Reader.ReadLine and
// prints that line after a plain string() conversion passed through
// fmt.Sprint. The bytes are not decoded in any way.
func main() {
	file, openErr := os.Open("test.txt")
	if openErr != nil {
		fmt.Printf("打开文件时发生错误:%v\n", openErr)
		return
	}

	// The isPrefix result is ignored.
	line, _, readErr := bufio.NewReader(file).ReadLine()
	if readErr != nil {
		fmt.Println(readErr)
		return
	}

	text := fmt.Sprint(string(line))
	fmt.Println(text)
}
英文:

If you want anything to print as a string you could use fmt.Sprint

package main

import (
	&quot;bufio&quot;
	&quot;fmt&quot;
	&quot;os&quot;
)

// main opens test.txt, reads its first line with bufio.Reader.ReadLine and
// prints that line after a plain string() conversion passed through
// fmt.Sprint. The bytes are not decoded in any way.
func main() {
	f, err := os.Open("test.txt")
	if err != nil {
		fmt.Printf("error opening file: %v\n", err)
		return
	}
	r := bufio.NewReader(f)
	// Only the first line is read; the isPrefix result is ignored.
	var s, _, e = r.ReadLine()
	if e != nil {
		fmt.Println(e)
		return
	}
	fmt.Println(fmt.Sprint(string(s)))
}

huangapple
  • 本文由 发表于 2013年4月3日 17:38:34
  • 转载请务必保留本文链接:https://go.coder-hub.com/15783830.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定