问题

我正在构建一个使用Opus流的Discord机器人。我尝试了各种方法，比如直接发送Opus数据包，将OPUS流解码为PCM并将其编码为字节数组，以及直接将PCM转换为字节数组。在所有情况下，我都收到以下错误信息：

无法识别：code:11 message:"音频数据流传输速度过慢。请以接近实时的速度传输音频数据。"

我尝试了8kHz-48kHz的频率和20毫秒的帧。我还尝试使用最大比特率对转换后的PCM进行编码。我已经成功运行了示例代码，所以我的连接没有问题。我应该在哪里寻找解决方案？

package main

import (
	"fmt"
	// "io"
	"log"
	"os"
	"flag"

	speech "cloud.google.com/go/speech/apiv1"
	"golang.org/x/net/context"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"

	"github.com/bwmarrin/discordgo"
	"layeh.com/gopus"
)

// OnError gets called by dgvoice when an error is encountered.
// By default logs to STDERR
var OnError = func(str string, err error) {
	prefix := "dgVoice: " + str

	if err != nil {
		os.Stderr.WriteString(prefix + ": " + err.Error())
	} else {
		os.Stderr.WriteString(prefix)
	}
}

var stream speechpb.Speech_StreamingRecognizeClient

func main() {
	var (
		Token	  = flag.String("t", "", "Discord bot token.")
//		Email     = flag.String("e", "", "Discord account email.")
//		Password  = flag.String("p", "", "Discord account password.")
		GuildID   = flag.String("g", "", "Guild ID")
		ChannelID = flag.String("c", "", "Channel ID")
	)
	flag.Parse()


	fmt.Println("Connecting to Discord...")
	// Connect to Discord
	discord, err := discordgo.New(*Token)
	if err != nil {
		fmt.Println(err)
		return
	}

	fmt.Println("Opening Socket...")
	// Open Websocket
	err = discord.Open()
	if err != nil {
		fmt.Println(err)
		return
	}

	fmt.Println("Joining Channel...")
	// Connect to voice channel.
	// NOTE: Setting mute to false, deaf to true.
	dgv, err := discord.ChannelVoiceJoin(*GuildID, *ChannelID, false, false)
	if err != nil {
		fmt.Println(err)
		return
	}

	fmt.Println("Connecting to Google Speech Recognition API...")
	ctx := context.Background()

	// [START speech_streaming_mic_recognize]
	client, err := speech.NewClient(ctx)
	if err != nil {
		log.Fatal(err)
	}
	stream, err = client.StreamingRecognize(ctx)
	if err != nil {
		log.Fatal(err)
	}
	// Send the initial configuration message.
	if err := stream.Send(&speechpb.StreamingRecognizeRequest{
		StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
			StreamingConfig: &speechpb.StreamingRecognitionConfig{
				Config: &speechpb.RecognitionConfig{
					Encoding:        speechpb.RecognitionConfig_LINEAR16,
					SampleRateHertz: 16000,
					LanguageCode:    "en-US",
				},
				//InterimResults: true,
				SingleUtterance: true,
			},

		},
	}); err != nil {
		log.Fatal(err)
	}


	recv := make(chan *discordgo.Packet, 2)
	go Receive(dgv, recv)

	send := make(chan []int16, 2)
	go Send(dgv, send)

	// dgv.Speaking(true)
 // 	defer dgv.Speaking(false)

	go func() {
		for {

			p, ok := <-recv
			if !ok {
				fmt.Println("Not OK")
				return
			}

			send <- p.PCM
		}
		
	} ()

	for {
		resp, err := stream.Recv()
		//fmt.Printf("%+v\n",resp)
		if err != nil {
			log.Fatalf("Cannot stream results: %v", err)
		}
		if err := resp.Error; err != nil {
			log.Fatalf("Could not recognize: %v", err)
		}
		for _, result := range resp.Results {
			fmt.Printf("Result: %+v\n", result)
		}
	}
	
	// Close connections
	dgv.Close()
	discord.Close()

	return
}

func Receive(v *discordgo.VoiceConnection, c chan *discordgo.Packet) {
	var speakers    map[uint32]*gopus.Decoder
	if c == nil {
		return
	}

	var err error
	for {
		p, ok := <-v.OpusRecv
		if !ok {
			return
		}

		if speakers == nil {
			speakers = make(map[uint32]*gopus.Decoder)
		}

		_, ok = speakers[p.SSRC]
		if !ok {
			speakers[p.SSRC], err = gopus.NewDecoder(16000, 1)
			if err != nil {
				OnError("error creating opus decoder", err)
				continue
			}
		}
		p.PCM, err = speakers[p.SSRC].Decode(p.Opus, 320, false)
		if err != nil {
			OnError("Error decoding opus data", err)
			continue
		}

		// try encoding pcm frame with Opus
		
		c <- p
	}
}

func Send(v *discordgo.VoiceConnection,  pcm <- chan []int16) {
	for {
		
		// read pcm from chan, exit if channel is closed.
		recv, ok := <-pcm
		if !ok {
			OnError("PCM Channel closed", nil)
			return
		}


		buf := make([]byte,2*len(recv))

		for i := 0; i < len(recv); i+=2 {
			var h, l uint8 = uint8(i>>8), uint8(i&0xff)
			buf[i] = h
			buf[i+1] = l
		}
		

		stream.Send(&speechpb.StreamingRecognizeRequest{
			StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
				AudioContent: buf,
			},
		});
	}

}

英文:

I'm building a discord bot that uses an Opus stream. I've tried various things such as sending the Opus packets up directly, decoding the OPUS stream to a PCM and encoding it into a byte array, and converting the PCM to a byte array directly. In all cases I get:

Could not recognize: code:11 message:"Audio data is being streamed too slow. Please stream audio data approximately at real time."

I've tried 8kHz-48kHz frequencies at 20 ms frames. I've also tried to encode the converted PCM with the max bitrate. I have run the sample code successfully, so there is no connection issue on my end. Where should I look for a solution?

package main
import (
&quot;fmt&quot;
//&quot;io&quot;
&quot;log&quot;
&quot;os&quot;
&quot;flag&quot;
speech &quot;cloud.google.com/go/speech/apiv1&quot;
&quot;golang.org/x/net/context&quot;
speechpb &quot;google.golang.org/genproto/googleapis/cloud/speech/v1&quot;
&quot;github.com/bwmarrin/discordgo&quot;
&quot;layeh.com/gopus&quot;
)
// OnError gets called by dgvoice when an error is encountered.
// By default logs to STDERR
var OnError = func(str string, err error) {
prefix := &quot;dgVoice: &quot; + str
if err != nil {
os.Stderr.WriteString(prefix + &quot;: &quot; + err.Error())
} else {
os.Stderr.WriteString(prefix)
}
}
var stream speechpb.Speech_StreamingRecognizeClient
func main() {
var (
Token	  = flag.String(&quot;t&quot;, &quot;&quot;, &quot;Discord bot token.&quot;)
//		Email     = flag.String(&quot;e&quot;, &quot;&quot;, &quot;Discord account email.&quot;)
//		Password  = flag.String(&quot;p&quot;, &quot;&quot;, &quot;Discord account password.&quot;)
GuildID   = flag.String(&quot;g&quot;, &quot;&quot;, &quot;Guild ID&quot;)
ChannelID = flag.String(&quot;c&quot;, &quot;&quot;, &quot;Channel ID&quot;)
)
flag.Parse()
fmt.Println(&quot;Connecting to Discord...&quot;)
// Connect to Discord
discord, err := discordgo.New(*Token)
if err != nil {
fmt.Println(err)
return
}
fmt.Println(&quot;Opening Socket...&quot;)
// Open Websocket
err = discord.Open()
if err != nil {
fmt.Println(err)
return
}
fmt.Println(&quot;Joining Channel...&quot;)
// Connect to voice channel.
// NOTE: Setting mute to false, deaf to true.
dgv, err := discord.ChannelVoiceJoin(*GuildID, *ChannelID, false, false)
if err != nil {
fmt.Println(err)
return
}
fmt.Println(&quot;Connecting to Google Speech Recognition API...&quot;)
ctx := context.Background()
// [START speech_streaming_mic_recognize]
client, err := speech.NewClient(ctx)
if err != nil {
log.Fatal(err)
}
stream, err = client.StreamingRecognize(ctx)
if err != nil {
log.Fatal(err)
}
// Send the initial configuration message.
if err := stream.Send(&amp;speechpb.StreamingRecognizeRequest{
StreamingRequest: &amp;speechpb.StreamingRecognizeRequest_StreamingConfig{
StreamingConfig: &amp;speechpb.StreamingRecognitionConfig{
Config: &amp;speechpb.RecognitionConfig{
Encoding:        speechpb.RecognitionConfig_LINEAR16,
SampleRateHertz: 16000,
LanguageCode:    &quot;en-US&quot;,
},
//InterimResults: true,
SingleUtterance: true,
},
},
}); err != nil {
log.Fatal(err)
}
recv := make(chan *discordgo.Packet, 2)
go Receive(dgv, recv)
send := make(chan []int16, 2)
go Send(dgv, send)
// dgv.Speaking(true)
// 	defer dgv.Speaking(false)
go func() {
for {
p, ok := &lt;-recv
if !ok {
fmt.Println(&quot;Not OK&quot;)
return
}
send &lt;- p.PCM
}
} ()
for {
resp, err := stream.Recv()
//fmt.Printf(&quot;%+v\n&quot;,resp)
if err != nil {
log.Fatalf(&quot;Cannot stream results: %v&quot;, err)
}
if err := resp.Error; err != nil {
log.Fatalf(&quot;Could not recognize: %v&quot;, err)
}
for _, result := range resp.Results {
fmt.Printf(&quot;Result: %+v\n&quot;, result)
}
}
// Close connections
dgv.Close()
discord.Close()
return
}
func Receive(v *discordgo.VoiceConnection, c chan *discordgo.Packet) {
var speakers    map[uint32]*gopus.Decoder
if c == nil {
return
}
var err error
for {
p, ok := &lt;-v.OpusRecv
if !ok {
return
}
if speakers == nil {
speakers = make(map[uint32]*gopus.Decoder)
}
_, ok = speakers[p.SSRC]
if !ok {
speakers[p.SSRC], err = gopus.NewDecoder(16000, 1)
if err != nil {
OnError(&quot;error creating opus decoder&quot;, err)
continue
}
}
p.PCM, err = speakers[p.SSRC].Decode(p.Opus, 320, false)
if err != nil {
OnError(&quot;Error decoding opus data&quot;, err)
continue
}
// try encoding pcm frame with Opus
c &lt;- p
}
}
func Send(v *discordgo.VoiceConnection,  pcm &lt;- chan []int16) {
for {
// read pcm from chan, exit if channel is closed.
recv, ok := &lt;-pcm
if !ok {
OnError(&quot;PCM Channel closed&quot;, nil)
return
}
buf := make([]byte,2*len(recv))
for i := 0; i &lt; len(recv); i+=2 {
var h, l uint8 = uint8(i&gt;&gt;8), uint8(i&amp;0xff)
buf[i] = h
buf[i+1] = l
}
stream.Send(&amp;speechpb.StreamingRecognizeRequest{
StreamingRequest: &amp;speechpb.StreamingRecognizeRequest_AudioContent{
AudioContent: buf,
},
});
}

}

答案1

得分: 2

Google语音转文本文档中提供了一个完全可用的Go语言流式语音识别示例。

当服务器没有实时接收到音频时，会发送“音频数据传输速度过慢”的消息。在这种情况下，上述代码存在一个错误，导致在每次发送循环的迭代中只发送了半个PCM帧：

        for i := 0; i < len(recv); i+=2 {
            var h, l uint8 = uint8(i>>8), uint8(i&0xff)
            buf[i] = h
            buf[i+1] = l
        }

recv是一个int16值的切片，因此应该逐个值进行迭代，而不是使用i+=2，这样会跳过每个其他值。buf是一个uint8切片，所以对它的索引是有效的。

英文:

The Google Speech-to-Text documentation has a fully working example of streaming speech recognition in Go.

"Audio data is being streamed too slow" is sent by the server when it is not receiving audio in realtime. In this case, the above code contains a bug that results in only half a PCM frame being sent on each iteration of the Send loop:

        for i := 0; i &lt; len(recv); i+=2 {
var h, l uint8 = uint8(i&gt;&gt;8), uint8(i&amp;0xff)
buf[i] = h
buf[i+1] = l
}

recv is a slice of int16 values, so it should be iterated over one value at a time, not i+=2, which skips every other value. buf is a uint8 slice, so the indexing for that is valid.

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

如何在golang中对音频流进行编码以进行Google语音识别？

问题

答案1

在GoLang服务器上的开发分支，多个listenAndServes

如何在 Go 项目的流水线中检测已弃用的直接依赖项

在Go编程语言中，“方法需要指针接收器”。

Go SQL, scanning a row as a slice?

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论