将Unicode转换为GSM编码在Golang中的实现

huangapple go评论85阅读模式
英文:

Converting unicode to gsm encoding in golang

问题

我正在将我的Python项目迁移到Go语言,并且有一个使用情况,需要将UTF-8编码转换为对应的GSM编码(如果可能的话)。我对Go语言非常陌生,如果能提供一些相关的文档或示例代码将非常有帮助。

例如:Python代码片段

ằ作为Unicode -> 在GSM编码后变为a

# Fragment from the question: walks `text` one character at a time and builds
# `transliterated_text`. `text`, `transliterated_text`, `gsm`, `rc`, `is_gsm`
# and `is_nonascii_utf8` are all defined outside this snippet.
# NOTE(review): appending `.encode('utf-8')` results and later calling
# `.decode('utf-8')` on the accumulator looks like Python 2 str handling — confirm.
for character in text:
    # Characters accepted by is_gsm() are appended as UTF-8 bytes, unchanged.
    if is_gsm(character):
       transliterated_text += character.encode('utf-8')
       continue
    if is_nonascii_utf8(character):
       # Ask unidecode for a replacement for this character.
       transliterated_char = unidecode.unidecode(character)
       # '?' or an empty result is treated as "no usable replacement":
       # clear the gsm flag and stop processing the rest of the text.
       if transliterated_char == '?' or transliterated_char == '':
          gsm = False
          break
       # When the replacement equals `rc`, nothing is appended for this character.
       if transliterated_char != rc:
          character = transliterated_char
          transliterated_text += character
    else:
          # Neither is_gsm() nor is_nonascii_utf8() matched: append as-is.
          transliterated_text += character.encode('utf-8')

# Replace `text` only if the loop never cleared `gsm` and the assembled
# result as a whole passes is_gsm().
if gsm and is_gsm(transliterated_text.decode('utf-8')):
   text = transliterated_text.decode('utf-8')

谢谢。

英文:

I am migrating my project from Python to Go, and I have a use case for converting UTF-8 text to its corresponding GSM encoding where possible. I am very new to Go, so any documentation or examples around this would be really helpful.

For example: Python snippet

ằ (as Unicode) -> a (after GSM encoding)

# Fragment from the question: walks `text` one character at a time and builds
# `transliterated_text`. `text`, `transliterated_text`, `gsm`, `rc`, `is_gsm`
# and `is_nonascii_utf8` are all defined outside this snippet.
# NOTE(review): appending `.encode('utf-8')` results and later calling
# `.decode('utf-8')` on the accumulator looks like Python 2 str handling — confirm.
for character in text:
    # Characters accepted by is_gsm() are appended as UTF-8 bytes, unchanged.
    if is_gsm(character):
       transliterated_text += character.encode('utf-8')
       continue
    if is_nonascii_utf8(character):
       # Ask unidecode for a replacement for this character.
       transliterated_char = unidecode.unidecode(character)
       # '?' or an empty result is treated as "no usable replacement":
       # clear the gsm flag and stop processing the rest of the text.
       if transliterated_char == '?' or transliterated_char == '':
          gsm = False
          break
       # When the replacement equals `rc`, nothing is appended for this character.
       if transliterated_char != rc:
          character = transliterated_char
          transliterated_text += character
    else:
          # Neither is_gsm() nor is_nonascii_utf8() matched: append as-is.
          transliterated_text += character.encode('utf-8')

# Replace `text` only if the loop never cleared `gsm` and the assembled
# result as a whole passes is_gsm().
if gsm and is_gsm(transliterated_text.decode('utf-8')):
   text = transliterated_text.decode('utf-8')

Thanks

答案1

得分: 2

你可以按照以下方式进行操作:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// gsmEscape is the GSM 03.38 escape code. It prefixes every two-byte
// extension-table sequence in utf8GsmChars.
const gsmEscape = 0x1B

// utf8GsmChars maps a single UTF-8 character to its GSM 03.38 encoding.
// Only characters whose GSM code differs from their ASCII byte are listed;
// every other ASCII character is passed through unchanged by the converters.
var utf8GsmChars = map[string]string{
	// Default alphabet.
	"@": "\x00", "£": "\x01", "$": "\x02", "¥": "\x03",
	"è": "\x04", "é": "\x05", "ù": "\x06", "ì": "\x07",
	"ò": "\x08", "Ç": "\x09", "Ø": "\x0B", "ø": "\x0C",
	"Å": "\x0E", "å": "\x0F", "Δ": "\x10", "_": "\x11",
	"Φ": "\x12", "Γ": "\x13", "Λ": "\x14", "Ω": "\x15",
	"Π": "\x16", "Ψ": "\x17", "Σ": "\x18", "Θ": "\x19",
	"Ξ": "\x1A", "Æ": "\x1C", "æ": "\x1D", "ß": "\x1E",
	"É": "\x1F", "¤": "\x24", "¡": "\x40", "Ä": "\x5B",
	"Ö": "\x5C", "Ñ": "\x5D", "Ü": "\x5E", "§": "\x5F",
	"¿": "\x60", "ä": "\x7B", "ö": "\x7C", "ñ": "\x7D",
	"ü": "\x7E", "à": "\x7F",

	// Extension table: the escape code followed by one more code.
	"^": "\x1B\x14", "{": "\x1B\x28", "}": "\x1B\x29",
	"\\": "\x1B\x2F", "[": "\x1B\x3C", "~": "\x1B\x3D",
	"]": "\x1B\x3E", "|": "\x1B\x40", "€": "\x1B\x65",
}

// gsmUtf8Chars maps a GSM 03.38 code (one byte, or the escape code plus one
// byte) back to its UTF-8 character. It is derived from utf8GsmChars so the
// two directions cannot drift apart.
var gsmUtf8Chars = invertGSMTable(utf8GsmChars)

// nonASCII matches every rune outside the 7-bit range. It is compiled once
// at package scope instead of on each conversion.
var nonASCII = regexp.MustCompile("[\\x{0080}-\\x{10FFFF}]")

// invertGSMTable returns a new map with the keys and values of enc swapped.
func invertGSMTable(enc map[string]string) map[string]string {
	dec := make(map[string]string, len(enc))
	for utf8Char, gsm := range enc {
		dec[gsm] = utf8Char
	}
	return dec
}

// UTF8ToGsm0338 converts UTF-8 text to GSM 03.38 codes.
//
// Characters listed in utf8GsmChars are replaced by their GSM code, other
// ASCII characters are copied unchanged, and every remaining non-ASCII
// character becomes "?".
func UTF8ToGsm0338(text string) string {
	var b strings.Builder
	b.Grow(len(text))

	// Convert in a single pass over the input. Several GSM codes are
	// themselves ASCII characters with their own mapping (Ä encodes to 0x5B,
	// which is '['), so output that was already converted must never be
	// scanned again.
	for _, r := range text {
		if gsm, ok := utf8GsmChars[string(r)]; ok {
			b.WriteString(gsm)
			continue
		}
		b.WriteRune(r)
	}

	// All mapped output is 7-bit, so anything non-ASCII left over is a
	// character with no GSM equivalent.
	return nonASCII.ReplaceAllString(b.String(), "?")
}

// GSM0338ToUTF8 converts GSM 03.38 codes back to UTF-8 text.
//
// Two-byte escape sequences are decoded through the extension entries of
// gsmUtf8Chars, single codes through its default-alphabet entries. Bytes
// with no table entry, including an escape code that does not start a known
// sequence, are copied unchanged.
func GSM0338ToUTF8(text string) string {
	var b strings.Builder
	b.Grow(len(text))

	// Walk the input byte by byte so that decoded output is never fed back
	// into the table lookup.
	for i := 0; i < len(text); i++ {
		// An escape sequence takes priority over the single-byte table.
		if text[i] == gsmEscape && i+1 < len(text) {
			if utf8Char, ok := gsmUtf8Chars[text[i:i+2]]; ok {
				b.WriteString(utf8Char)
				i++ // the second byte of the sequence is consumed too
				continue
			}
		}
		if utf8Char, ok := gsmUtf8Chars[text[i:i+1]]; ok {
			b.WriteString(utf8Char)
			continue
		}
		b.WriteByte(text[i])
	}

	return b.String()
}

// main runs a sample string through UTF8ToGsm0338 and GSM0338ToUTF8 and
// prints the original text, the GSM form, and the decoded result.
func main() {
	const original = "Hello World"

	encoded := UTF8ToGsm0338(original)
	decoded := GSM0338ToUTF8(encoded)

	fmt.Printf("word before: %s\nword after gsm: %s\nword after utf8: %s\n", original, encoded, decoded)
}
英文:

You can do it in this way:

package main

import (
        "fmt"
        "regexp"
        "strings"
)

// gsmEscape is the GSM 03.38 escape code. It prefixes every two-byte
// extension-table sequence in utf8GsmChars.
const gsmEscape = 0x1B

// utf8GsmChars maps a single UTF-8 character to its GSM 03.38 encoding.
// Only characters whose GSM code differs from their ASCII byte are listed;
// every other ASCII character is passed through unchanged by the converters.
var utf8GsmChars = map[string]string{
	// Default alphabet.
	"@": "\x00", "£": "\x01", "$": "\x02", "¥": "\x03",
	"è": "\x04", "é": "\x05", "ù": "\x06", "ì": "\x07",
	"ò": "\x08", "Ç": "\x09", "Ø": "\x0B", "ø": "\x0C",
	"Å": "\x0E", "å": "\x0F", "Δ": "\x10", "_": "\x11",
	"Φ": "\x12", "Γ": "\x13", "Λ": "\x14", "Ω": "\x15",
	"Π": "\x16", "Ψ": "\x17", "Σ": "\x18", "Θ": "\x19",
	"Ξ": "\x1A", "Æ": "\x1C", "æ": "\x1D", "ß": "\x1E",
	"É": "\x1F", "¤": "\x24", "¡": "\x40", "Ä": "\x5B",
	"Ö": "\x5C", "Ñ": "\x5D", "Ü": "\x5E", "§": "\x5F",
	"¿": "\x60", "ä": "\x7B", "ö": "\x7C", "ñ": "\x7D",
	"ü": "\x7E", "à": "\x7F",

	// Extension table: the escape code followed by one more code.
	"^": "\x1B\x14", "{": "\x1B\x28", "}": "\x1B\x29",
	"\\": "\x1B\x2F", "[": "\x1B\x3C", "~": "\x1B\x3D",
	"]": "\x1B\x3E", "|": "\x1B\x40", "€": "\x1B\x65",
}

// gsmUtf8Chars maps a GSM 03.38 code (one byte, or the escape code plus one
// byte) back to its UTF-8 character. It is derived from utf8GsmChars so the
// two directions cannot drift apart.
var gsmUtf8Chars = invertGSMTable(utf8GsmChars)

// nonASCII matches every rune outside the 7-bit range. It is compiled once
// at package scope instead of on each conversion.
var nonASCII = regexp.MustCompile("[\\x{0080}-\\x{10FFFF}]")

// invertGSMTable returns a new map with the keys and values of enc swapped.
func invertGSMTable(enc map[string]string) map[string]string {
	dec := make(map[string]string, len(enc))
	for utf8Char, gsm := range enc {
		dec[gsm] = utf8Char
	}
	return dec
}

// UTF8ToGsm0338 converts UTF-8 text to GSM 03.38 codes.
//
// Characters listed in utf8GsmChars are replaced by their GSM code, other
// ASCII characters are copied unchanged, and every remaining non-ASCII
// character becomes "?".
func UTF8ToGsm0338(text string) string {
	var b strings.Builder
	b.Grow(len(text))

	// Convert in a single pass over the input. Several GSM codes are
	// themselves ASCII characters with their own mapping (Ä encodes to 0x5B,
	// which is '['), so output that was already converted must never be
	// scanned again.
	for _, r := range text {
		if gsm, ok := utf8GsmChars[string(r)]; ok {
			b.WriteString(gsm)
			continue
		}
		b.WriteRune(r)
	}

	// All mapped output is 7-bit, so anything non-ASCII left over is a
	// character with no GSM equivalent.
	return nonASCII.ReplaceAllString(b.String(), "?")
}

// GSM0338ToUTF8 converts GSM 03.38 codes back to UTF-8 text.
//
// Two-byte escape sequences are decoded through the extension entries of
// gsmUtf8Chars, single codes through its default-alphabet entries. Bytes
// with no table entry, including an escape code that does not start a known
// sequence, are copied unchanged.
func GSM0338ToUTF8(text string) string {
	var b strings.Builder
	b.Grow(len(text))

	// Walk the input byte by byte so that decoded output is never fed back
	// into the table lookup.
	for i := 0; i < len(text); i++ {
		// An escape sequence takes priority over the single-byte table.
		if text[i] == gsmEscape && i+1 < len(text) {
			if utf8Char, ok := gsmUtf8Chars[text[i:i+2]]; ok {
				b.WriteString(utf8Char)
				i++ // the second byte of the sequence is consumed too
				continue
			}
		}
		if utf8Char, ok := gsmUtf8Chars[text[i:i+1]]; ok {
			b.WriteString(utf8Char)
			continue
		}
		b.WriteByte(text[i])
	}

	return b.String()
}

// main runs a sample string through UTF8ToGsm0338 and GSM0338ToUTF8 and
// prints the original text, the GSM form, and the decoded result.
func main() {
	const original = "Hello World"

	encoded := UTF8ToGsm0338(original)
	decoded := GSM0338ToUTF8(encoded)

	fmt.Printf("word before: %s\nword after gsm: %s\nword after utf8: %s\n", original, encoded, decoded)
}

huangapple
  • 本文由 发表于 2022年7月28日 17:28:58
  • 转载请务必保留本文链接:https://go.coder-hub.com/73150438.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定