英文:
Bad connection response to long-running MSSQL transaction in Golang
问题
我有一个请求者,用于管理针对Azure SQL数据库的SQL查询。负责事务查询的函数如下所示:
import (
"context"
"database/sql"
"fmt"
"log"
"strings"
"time"
"github.com/cenkalti/backoff"
_ "github.com/denisenkom/go-mssqldb" // 需要导入SQL驱动程序,以便告诉Golang如何解释我们的请求
)
// 在查询和变量上使用上下文执行单个带事务的Exec的辅助函数。
// 如果有任何失败,此函数将返回错误。
func (requester *Requester) doTransaction(ctx context.Context,
isolation sql.IsolationLevel, txFunc func(*sql.Tx) error) error {
// 首先,获取数据库连接;如果失败,则返回错误
conn, err := requester.getConn(ctx)
if err != nil {
return err
}
// 在继续之前,确保连接关闭并返回到连接池
defer func() {
if err := conn.Close(); err != nil {
log.Printf("关闭失败,错误:%v", err)
}
}()
// 接下来,使用给定的上下文和默认隔离级别启动事务
tx, err := requester.getTx(ctx, conn, isolation)
if err != nil {
return err
}
// 现在,在函数结束之前,确保事务在回滚或提交之前完成
var tErr error
defer func() {
if p := recover(); p != nil {
tx.Rollback()
panic(p)
} else if tErr != nil {
log.Printf("发生错误:%v", tErr)
if err := tx.Rollback(); err != nil {
log.Printf("回滚失败,错误:%v", err)
}
} else {
if tErr := tx.Commit(); tErr != nil {
log.Printf("提交失败,错误:%v", tErr)
}
}
}()
// 最后,运行函数并返回结果
tErr = txFunc(tx)
return tErr
}
// 使用备份和重试获取与数据库的连接的辅助函数
func (requester *Requester) getConn(ctx context.Context) (*sql.Conn, error) {
// 创建一个对象,指定何时以及如何进行重试
// 我们当前希望指数回退,最多重试5次
repeater := backoff.WithContext(backoff.WithMaxRetries(
backoff.NewExponentialBackOff(), 5), ctx)
// 使用500毫秒的等待时间和最多5次重试进行重试操作
// 并返回其中的操作结果
var conn *sql.Conn
if err := backoff.Retry(func() error {
// 尝试获取与数据库的连接
var err error
if conn, err = requester.conn.Conn(ctx); err != nil {
// 获取连接失败;如果有登录错误、EOF或握手失败,
// 则稍后再次尝试连接,只需返回并让回退代码处理
log.Printf("连接失败,错误:%v", err)
if isLoginError(err, requester.serverName, requester.databaseName) {
return err
} else if strings.Contains(err.Error(), "EOF") {
return err
} else if strings.Contains(err.Error(), "TLS Handshake failed") {
return err
}
// 否则,无法从错误中恢复,因此返回错误
return backoff.Permanent(err)
}
return nil
}, repeater); err != nil {
return nil, err
}
return conn, nil
}
// 启动与数据库的事务的辅助函数
func (requester *Requester) getTx(ctx context.Context, conn *sql.Conn,
isolation sql.IsolationLevel) (*sql.Tx, error) {
// 创建一个对象,指定何时以及如何进行重试
// 我们当前希望指数回退,最多重试5次
repeater := backoff.WithContext(backoff.WithMaxRetries(
backoff.NewExponentialBackOff(), 5), ctx)
// 使用500毫秒的等待时间和最多5次重试进行重试操作
// 并返回其中的操作结果
var tx *sql.Tx
if err := backoff.Retry(func() error {
// 尝试使用给定的上下文和默认隔离级别启动事务
var err error
if tx, err = conn.BeginTx(ctx, &sql.TxOptions{Isolation: isolation, ReadOnly: false}); err != nil {
// 创建事务失败;如果有连接错误,则稍后尝试连接,只需返回并让回退代码处理
if strings.Contains(err.Error(), "An existing connection was forcibly closed by the remote host.") ||
strings.Contains(err.Error(), "bad connection") {
log.Printf("BeginTx失败,错误:%v。正在重试...", err)
return err
}
// 否则,无法从错误中恢复,因此返回错误
log.Printf("在尝试创建事务时发生未知/未捕获的异常,错误:%v", err)
return backoff.Permanent(err)
}
return nil
}, repeater); err != nil {
return nil, err
}
return tx, nil
}
请求者对象包装了一个sql.Db
,可以按以下方式创建:
// 首先,从端点、端口、用户名、密码和数据库名创建连接字符串
connString := fmt.Sprintf("server=%s;user id=%s;password=%s;port=%s;database=%s;connection timeout=30",
endpoint, dbUser, dbPassword, port, dbName)
// 最后,尝试连接到数据库。如果失败,则返回错误
db, err := sql.Open("sqlserver", connString)
if err != nil {
return nil, err
}
// 确保我们的连接以及重复使用,以避免坏连接和I/O超时
db.SetMaxOpenConns(20)
db.SetConnMaxLifetime(10 * time.Minute)
db.SetConnMaxIdleTime(10 * time.Minute)
总体而言,这个方法效果很好。我注意到的一个问题是,当两个请求之间的时间间隔很长时,第一次重试会出现i/o timeout
错误,然后后续重试会出现bad connection
错误,最终导致失败。我认为问题与此错误有关。基本上,似乎Microsoft在30分钟后使空闲请求无效。然而,由于我将最大空闲时间设置为10分钟,这不应该是问题。
这里发生了什么,我该如何解决这个问题?
英文:
I have a requester that manages my SQL queries against an Azure SQL database. The function responsible for transaction queries is as follows:
import (
"context"
"database/sql"
"fmt"
"log"
"strings"
"time"
"github.com/cenkalti/backoff"
_ "github.com/denisenkom/go-mssqldb" // Need to import the SQL driver so we can tell Golang how to interpret our requests
)
// Helper function that does a single Exec with a transaction with a context on a query and variables.
// This function will return an error if there are any failures
func (requester *Requester) doTransaction(ctx context.Context,
isolation sql.IsolationLevel, txFunc func(*sql.Tx) error) error {
// First, get the database connection; if this fails then return an error
conn, err := requester.getConn(ctx)
if err != nil {
return err
}
// Before we continue on, ensure that the connection is clsoed and returned to the connection pool
defer func() {
if err := conn.Close(); err != nil {
log.Printf("Close failed, error: %v", err)
}
}()
// Next, start the transaction with the given context and the default isolation
tx, err := requester.getTx(ctx, conn, isolation)
if err != nil {
return err
}
// Now, ensure that the transaction is either rolled back or committed before
// the function ends
var tErr error
defer func() {
if p := recover(); p != nil {
tx.Rollback()
panic(p)
} else if tErr != nil {
log.Printf("An error occurred: %v", tErr)
if err := tx.Rollback(); err != nil {
log.Printf("Rollback failed, error: %v", err)
}
} else {
if tErr := tx.Commit(); tErr != nil {
log.Printf("Commit failed, error: %v", tErr)
}
}
}()
// Finally, run the function and return the result
tErr = txFunc(tx)
return tErr
}
// Helper function that gets a connection to the database with a backup and retry
func (requester *Requester) getConn(ctx context.Context) (*sql.Conn, error) {
// Create an object that will dictate how and when the retries are done
// We currently want an exponential backoff that retries a maximum of 5 times
repeater := backoff.WithContext(backoff.WithMaxRetries(
backoff.NewExponentialBackOff(), 5), ctx)
// Do a retry operation with a 500ms wait time and a maximum of 5 retries
// and return the result of the operation therein
var conn *sql.Conn
if err := backoff.Retry(func() error {
// Attempt to get the connection to the database
var err error
if conn, err = requester.conn.Conn(ctx); err != nil {
// We failed to get the connection; if we have a login error, an EOF or handshake
// failure then we'll attempt the connection again later so just return it and let
// the backoff code handle it
log.Printf("Conn failed, error: %v", err)
if isLoginError(err, requester.serverName, requester.databaseName) {
return err
} else if strings.Contains(err.Error(), "EOF") {
return err
} else if strings.Contains(err.Error(), "TLS Handshake failed") {
return err
}
// Otherwise, we can't recover from the error so return it
return backoff.Permanent(err)
}
return nil
}, repeater); err != nil {
return nil, err
}
return conn, nil
}
// Helper function that starts a transaction against the database
func (requester *Requester) getTx(ctx context.Context, conn *sql.Conn,
isolation sql.IsolationLevel) (*sql.Tx, error) {
// Create an object that will dictate how and when the retries are done
// We currently want an exponential backoff that retries a maximum of 5 times
repeater := backoff.WithContext(backoff.WithMaxRetries(
backoff.NewExponentialBackOff(), 5), ctx)
// Do a retry operation with a 500ms wait time and a maximum of 5 retries
// and return the result of the operation therein
var tx *sql.Tx
if err := backoff.Retry(func() error {
// Attempt to start the transaction with the given context and the default isolation
var err error
if tx, err = conn.BeginTx(ctx, &sql.TxOptions{Isolation: isolation, ReadOnly: false}); err != nil {
// We failed to create the transaction; if we have a connection error then we'll
// attempt the connection again later so just return it and let the backoff code handle it
if strings.Contains(err.Error(), "An existing connection was forcibly closed by the remote host.") ||
strings.Contains(err.Error(), "bad connection") {
log.Printf("BeginTx failed, error: %v. Retrying...", err)
return err
}
// Otherwise, we can't recover from the error so return it
log.Printf("Unknown/uncaught exception when attempting to create a transaction, error: %v", err)
return backoff.Permanent(err)
}
return nil
}, repeater); err != nil {
return nil, err
}
return tx, nil
}
The requester object wraps an sql.Db
and is created like this:
// First, create a connection string from the endpoint, port, user name, password and database name
connString := fmt.Sprintf("server=%s;user id=%s;password=%s;port=%s;database=%s;connection timeout=30",
endpoint, dbUser, dbPassword, port, dbName)
// Finally, attempt to connect to the database. If this fails then return an error
db, err := sql.Open("sqlserver", connString)
if err != nil {
return nil, err
}
// Ensure that our connections are used and reused in such a way
// as to avoid bad connections and I/O timeouts
db.SetMaxOpenConns(20)
db.SetConnMaxLifetime(10 * time.Minute)
db.SetConnMaxIdleTime(10 * time.Minute)
On the whole this works well. The one problem I've noticed is that, when a long time has elapsed between individual requests, then I'll get an i/o timeout
error on the first retry and then bad connection
errors on subsequent retries, ultimately resulting in failure. My thought is that the problem is related to this bug. Essentially, it appears that Microsoft invalidates idle requests after 30 minutes. However, as I have the maximum idle time set to 10 minutes, this shouldn't be the problem.
What's going on here and how do I fix this issue?
答案1
得分: 1
经过一些调查,我发现在30分钟的时间窗口后,数据库连接会变得陈旧,修改连接池的生命周期或空闲时间并不能真正解决这个问题。因此,为了缓解这个问题,我修改了我的getConn
函数,在连接之前对服务器进行了ping操作,以确保连接是“新鲜”的。
func (requester *Requester) getConn(ctx context.Context) (*sql.Conn, error) {
// 首先,尝试ping服务器以确保连接正常
// 如果失败,则返回错误
if err := requester.conn.PingContext(ctx); err != nil {
return nil, err
}
// 创建一个对象来指定重试的方式和时间
// 我们目前希望指数回退,最多重试5次
repeater := backoff.WithContext(backoff.WithMaxRetries(
backoff.NewExponentialBackOff(), 5), ctx)
// 使用500ms的等待时间和最多5次重试进行重试操作
// 并返回操作的结果
var conn *sql.Conn
if err := backoff.Retry(func() error {
// 尝试获取与数据库的连接
var err error
if conn, err = requester.conn.Conn(ctx); err != nil {
// 获取连接失败;如果是登录错误、EOF或握手失败,则稍后再次尝试连接,只需返回错误并让回退代码处理
log.Printf("连接失败,错误:%v", err)
if isLoginError(err, requester.serverName, requester.databaseName) {
return err
} else if strings.Contains(err.Error(), "EOF") {
return err
} else if strings.Contains(err.Error(), "TLS Handshake failed") {
return err
}
// 否则,无法从错误中恢复,所以返回错误
return backoff.Permanent(err)
}
return nil
}, repeater); err != nil {
return nil, err
}
return conn, nil
}
英文:
After some investigation, I discovered that the database connection grows stale after a 30 minute window, and modifying the lifetime or idle time of the connection pool doesn't really do anything to fix that. So, what I did to alleviate this problem was to modify my getConn
function to ping the server beforehand so I could ensure that the connection is "fresh", for lack of a better term.
func (requester *Requester) getConn(ctx context.Context) (*sql.Conn, error) {
// First, attempt to ping the server to ensure that the connection is good
// If this fails, then return an error
if err := requester.conn.PingContext(ctx); err != nil {
return nil, err
}
// Create an object that will dictate how and when the retries are done
// We currently want an exponential backoff that retries a maximum of 5 times
repeater := backoff.WithContext(backoff.WithMaxRetries(
backoff.NewExponentialBackOff(), 5), ctx)
// Do a retry operation with a 500ms wait time and a maximum of 5 retries
// and return the result of the operation therein
var conn *sql.Conn
if err := backoff.Retry(func() error {
// Attempt to get the connection to the database
var err error
if conn, err = requester.conn.Conn(ctx); err != nil {
// We failed to get the connection; if we have a login error, an EOF or handshake
// failure then we'll attempt the connection again later so just return it and let
// the backoff code handle it
log.Printf("Conn failed, error: %v", err)
if isLoginError(err, requester.serverName, requester.databaseName) {
return err
} else if strings.Contains(err.Error(), "EOF") {
return err
} else if strings.Contains(err.Error(), "TLS Handshake failed") {
return err
}
// Otherwise, we can't recover from the error so return it
return backoff.Permanent(err)
}
return nil
}, repeater); err != nil {
return nil, err
}
return conn, nil
}
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论