如何在golang中对音频流进行编码以进行Google语音识别?

huangapple go评论91阅读模式
英文:

How should I encode an audio stream for Google Voice recognition in golang?

问题

我正在构建一个使用Opus流的Discord机器人。我尝试了各种方法,比如直接发送Opus数据包,将OPUS流解码为PCM并将其编码为字节数组,以及直接将PCM转换为字节数组。在所有情况下,我都收到以下错误信息:

无法识别:code:11 message:"音频数据流传输速度过慢。请以接近实时的速度传输音频数据。"

我尝试了8kHz-48kHz的频率和20毫秒的帧。我还尝试使用最大比特率对转换后的PCM进行编码。我已经成功运行了示例代码,所以我的连接没有问题。我应该在哪里寻找解决方案?

package main

import (
	"fmt"
	// "io"
	"log"
	"os"
	"flag"

	speech "cloud.google.com/go/speech/apiv1"
	"golang.org/x/net/context"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"

	"github.com/bwmarrin/discordgo"
	"layeh.com/gopus"
)

// OnError is the error hook used by the voice helpers in this file.
// The default implementation writes "dgVoice: <str>" to standard error,
// followed by ": <err>" when err is non-nil. No trailing newline is added.
var OnError = func(str string, err error) {
	msg := "dgVoice: " + str
	if err != nil {
		msg += ": " + err.Error()
	}
	os.Stderr.WriteString(msg)
}

// stream is the streaming-recognition client shared across this file: it is
// assigned once during start-up, Send writes audio to it, and the result loop
// reads responses from it.
var stream speechpb.Speech_StreamingRecognizeClient

// main parses the command-line flags (-t token, -g guild ID, -c channel ID)
// and runs the bot. Any failure is logged and the process exits with a
// non-zero status.
func main() {
	var (
		Token     = flag.String("t", "", "Discord bot token.")
		GuildID   = flag.String("g", "", "Guild ID")
		ChannelID = flag.String("c", "", "Channel ID")
	)
	flag.Parse()

	// log.Fatal is called only here, after runBot has returned, so that the
	// deferred clean-up inside runBot is not skipped by os.Exit.
	if err := runBot(*Token, *GuildID, *ChannelID); err != nil {
		log.Fatal(err)
	}
}

// runBot connects to Discord, joins the given voice channel, opens a streaming
// recognition session with the Google Speech API, starts the goroutines that
// feed decoded audio into that session, and prints every recognition result
// until the stream reports an error. The error that ended the run is returned.
func runBot(token, guildID, channelID string) error {
	fmt.Println("Connecting to Discord...")
	discord, err := discordgo.New(token)
	if err != nil {
		return err
	}

	fmt.Println("Opening Socket...")
	if err := discord.Open(); err != nil {
		return err
	}
	defer discord.Close()

	fmt.Println("Joining Channel...")
	// Both mute and deaf are false.
	// NOTE(review): presumably deaf must stay false for OpusRecv to deliver
	// audio — confirm against the discordgo documentation.
	dgv, err := discord.ChannelVoiceJoin(guildID, channelID, false, false)
	if err != nil {
		return err
	}
	// Deferred after discord.Close, so the voice connection is closed first.
	defer dgv.Close()

	fmt.Println("Connecting to Google Speech Recognition API...")
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return err
	}
	defer client.Close()

	// Assigns the package-level stream that Send writes to.
	stream, err = client.StreamingRecognize(ctx)
	if err != nil {
		return err
	}

	// The first message on the stream must carry the configuration.
	if err := stream.Send(&speechpb.StreamingRecognizeRequest{
		StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
			StreamingConfig: &speechpb.StreamingRecognitionConfig{
				Config: &speechpb.RecognitionConfig{
					Encoding:        speechpb.RecognitionConfig_LINEAR16,
					SampleRateHertz: 16000,
					LanguageCode:    "en-US",
				},
				//InterimResults: true,
				SingleUtterance: true,
			},
		},
	}); err != nil {
		return err
	}

	recv := make(chan *discordgo.Packet, 2)
	go Receive(dgv, recv)

	send := make(chan []int16, 2)
	go Send(dgv, send)

	// Relay the PCM payload of every decoded packet to the sender.
	go func() {
		for p := range recv {
			send <- p.PCM
		}
		fmt.Println("Not OK")
	}()

	for {
		resp, err := stream.Recv()
		if err != nil {
			return fmt.Errorf("Cannot stream results: %v", err)
		}
		if err := resp.Error; err != nil {
			return fmt.Errorf("Could not recognize: %v", err)
		}
		for _, result := range resp.Results {
			fmt.Printf("Result: %+v\n", result)
		}
	}
}

// Receive reads Opus packets from v.OpusRecv, decodes each one to PCM and
// forwards the packet, with its PCM field filled in, on c.
//
// One decoder (16 kHz, one channel) is kept per SSRC. A packet whose decoder
// cannot be created or whose payload cannot be decoded is reported through
// OnError and dropped. Receive returns immediately if c is nil, and returns
// when v.OpusRecv is closed.
func Receive(v *discordgo.VoiceConnection, c chan *discordgo.Packet) {
	if c == nil {
		return
	}

	speakers := make(map[uint32]*gopus.Decoder)
	for p := range v.OpusRecv {
		dec, ok := speakers[p.SSRC]
		if !ok {
			var err error
			dec, err = gopus.NewDecoder(16000, 1)
			if err != nil {
				OnError("error creating opus decoder", err)
				continue
			}
			// Store the decoder only after it was created successfully, so a
			// failed attempt never leaves a nil entry behind for later packets.
			speakers[p.SSRC] = dec
		}

		// 320 samples is 20 ms at the decoder's 16 kHz rate.
		pcm, err := dec.Decode(p.Opus, 320, false)
		if err != nil {
			OnError("Error decoding opus data", err)
			continue
		}
		p.PCM = pcm

		c <- p
	}
}

// Send reads PCM frames from pcm, serialises each frame as 16-bit
// little-endian bytes and writes it to the package-level speech stream as
// audio content.
//
// It returns when pcm is closed or when the stream rejects a write; both
// events are reported through OnError. The v parameter is not used.
func Send(v *discordgo.VoiceConnection, pcm <-chan []int16) {
	for {
		// Read pcm from chan, exit if channel is closed.
		recv, ok := <-pcm
		if !ok {
			OnError("PCM Channel closed", nil)
			return
		}

		// Every int16 sample becomes two bytes, low byte first, which is the
		// layout the LINEAR16 encoding configured on the stream expects.
		buf := make([]byte, 2*len(recv))
		for i, sample := range recv {
			buf[2*i] = byte(sample)
			buf[2*i+1] = byte(sample >> 8)
		}

		err := stream.Send(&speechpb.StreamingRecognizeRequest{
			StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
				AudioContent: buf,
			},
		})
		if err != nil {
			// A failed write means the stream is no longer usable; stop
			// instead of failing again on every following frame.
			OnError("error sending audio to speech stream", err)
			return
		}
	}
}
英文:

I'm building a discord bot that uses an Opus stream. I've tried various things such as sending the Opus packets up directly, decoding the OPUS stream to a PCM and encoding it into a byte array, and converting the PCM to a byte array directly. In all cases I get:

Could not recognize: code:11 message:"Audio data is being streamed too slow. Please stream audio data approximately at real time."

I've tried 8kHz-48kHz frequencies at 20 ms frames. I've also tried to encode the converted PCM with the max bitrate. I have run the sample code successfully, so there is no connection issue on my end. Where should I look for a solution?

package main
import (
&quot;fmt&quot;
//&quot;io&quot;
&quot;log&quot;
&quot;os&quot;
&quot;flag&quot;
speech &quot;cloud.google.com/go/speech/apiv1&quot;
&quot;golang.org/x/net/context&quot;
speechpb &quot;google.golang.org/genproto/googleapis/cloud/speech/v1&quot;
&quot;github.com/bwmarrin/discordgo&quot;
&quot;layeh.com/gopus&quot;
)
// OnError is the error hook used by the voice helpers in this file.
// The default implementation writes "dgVoice: <str>" to standard error,
// followed by ": <err>" when err is non-nil. No trailing newline is added.
var OnError = func(str string, err error) {
	msg := "dgVoice: " + str
	if err != nil {
		msg += ": " + err.Error()
	}
	os.Stderr.WriteString(msg)
}
// stream is the streaming-recognition client shared across this file: it is
// assigned once during start-up, Send writes audio to it, and the result loop
// reads responses from it.
var stream speechpb.Speech_StreamingRecognizeClient
func main() {
var (
Token	  = flag.String(&quot;t&quot;, &quot;&quot;, &quot;Discord bot token.&quot;)
//		Email     = flag.String(&quot;e&quot;, &quot;&quot;, &quot;Discord account email.&quot;)
//		Password  = flag.String(&quot;p&quot;, &quot;&quot;, &quot;Discord account password.&quot;)
GuildID   = flag.String(&quot;g&quot;, &quot;&quot;, &quot;Guild ID&quot;)
ChannelID = flag.String(&quot;c&quot;, &quot;&quot;, &quot;Channel ID&quot;)
)
flag.Parse()
fmt.Println(&quot;Connecting to Discord...&quot;)
// Connect to Discord
discord, err := discordgo.New(*Token)
if err != nil {
fmt.Println(err)
return
}
fmt.Println(&quot;Opening Socket...&quot;)
// Open Websocket
err = discord.Open()
if err != nil {
fmt.Println(err)
return
}
fmt.Println(&quot;Joining Channel...&quot;)
// Connect to voice channel.
// NOTE: Setting mute to false, deaf to true.
dgv, err := discord.ChannelVoiceJoin(*GuildID, *ChannelID, false, false)
if err != nil {
fmt.Println(err)
return
}
fmt.Println(&quot;Connecting to Google Speech Recognition API...&quot;)
ctx := context.Background()
// [START speech_streaming_mic_recognize]
client, err := speech.NewClient(ctx)
if err != nil {
log.Fatal(err)
}
stream, err = client.StreamingRecognize(ctx)
if err != nil {
log.Fatal(err)
}
// Send the initial configuration message.
if err := stream.Send(&amp;speechpb.StreamingRecognizeRequest{
StreamingRequest: &amp;speechpb.StreamingRecognizeRequest_StreamingConfig{
StreamingConfig: &amp;speechpb.StreamingRecognitionConfig{
Config: &amp;speechpb.RecognitionConfig{
Encoding:        speechpb.RecognitionConfig_LINEAR16,
SampleRateHertz: 16000,
LanguageCode:    &quot;en-US&quot;,
},
//InterimResults: true,
SingleUtterance: true,
},
},
}); err != nil {
log.Fatal(err)
}
recv := make(chan *discordgo.Packet, 2)
go Receive(dgv, recv)
send := make(chan []int16, 2)
go Send(dgv, send)
// dgv.Speaking(true)
// 	defer dgv.Speaking(false)
go func() {
for {
p, ok := &lt;-recv
if !ok {
fmt.Println(&quot;Not OK&quot;)
return
}
send &lt;- p.PCM
}
} ()
for {
resp, err := stream.Recv()
//fmt.Printf(&quot;%+v\n&quot;,resp)
if err != nil {
log.Fatalf(&quot;Cannot stream results: %v&quot;, err)
}
if err := resp.Error; err != nil {
log.Fatalf(&quot;Could not recognize: %v&quot;, err)
}
for _, result := range resp.Results {
fmt.Printf(&quot;Result: %+v\n&quot;, result)
}
}
// Close connections
dgv.Close()
discord.Close()
return
}
func Receive(v *discordgo.VoiceConnection, c chan *discordgo.Packet) {
var speakers    map[uint32]*gopus.Decoder
if c == nil {
return
}
var err error
for {
p, ok := &lt;-v.OpusRecv
if !ok {
return
}
if speakers == nil {
speakers = make(map[uint32]*gopus.Decoder)
}
_, ok = speakers[p.SSRC]
if !ok {
speakers[p.SSRC], err = gopus.NewDecoder(16000, 1)
if err != nil {
OnError(&quot;error creating opus decoder&quot;, err)
continue
}
}
p.PCM, err = speakers[p.SSRC].Decode(p.Opus, 320, false)
if err != nil {
OnError(&quot;Error decoding opus data&quot;, err)
continue
}
// try encoding pcm frame with Opus
c &lt;- p
}
}
func Send(v *discordgo.VoiceConnection,  pcm &lt;- chan []int16) {
for {
// read pcm from chan, exit if channel is closed.
recv, ok := &lt;-pcm
if !ok {
OnError(&quot;PCM Channel closed&quot;, nil)
return
}
buf := make([]byte,2*len(recv))
for i := 0; i &lt; len(recv); i+=2 {
var h, l uint8 = uint8(i&gt;&gt;8), uint8(i&amp;0xff)
buf[i] = h
buf[i+1] = l
}
stream.Send(&amp;speechpb.StreamingRecognizeRequest{
StreamingRequest: &amp;speechpb.StreamingRecognizeRequest_AudioContent{
AudioContent: buf,
},
});
}

}

答案1

得分: 2

Google语音转文本文档中提供了一个完全可用的Go语言流式语音识别示例。

当服务器没有实时接收到音频时,会发送“音频数据传输速度过慢”的消息。在这种情况下,上述代码存在一个错误,导致在每次发送循环的迭代中只发送了半个PCM帧:

        for i := 0; i < len(recv); i+=2 {
            var h, l uint8 = uint8(i>>8), uint8(i&0xff)
            buf[i] = h
            buf[i+1] = l
        }

recv是一个int16值的切片,因此应该逐个值进行迭代,而不是使用i+=2(那样会每隔一个值就跳过一个)。buf是一个uint8切片,所以对它使用这样的索引方式是有效的。

英文:

The Google Speech-to-Text documentation has a fully working example of streaming speech recognition in Go.

"Audio data is being streamed too slow" is sent by the server when it is not receiving audio in realtime. In this case, the above code contains a bug that results in only half a PCM frame being sent on each iteration of the Send loop:

        for i := 0; i < len(recv); i+=2 {
            var h, l uint8 = uint8(i>>8), uint8(i&0xff)
            buf[i] = h
            buf[i+1] = l
        }

recv is a slice of int16 values, so it should be iterated over one value at a time, not i+=2, which skips every other value. buf is a uint8 slice, so the indexing for that is valid.


huangapple
  • 本文由 发表于 2017年9月7日 09:13:44
  • 转载请务必保留本文链接:https://go.coder-hub.com/46086413.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定