Go webcrawler hangs after checking about 2000 urls
Question
I have a program to check whether keywords are on a web page. But after checking 1000-3000 urls, it hangs. There is no output, it does not exit, and the number of tcp connections is zero. I don't know why there are no new connections.
Would you give me some advice how to debug it?
type requestReturn struct {
    url    string
    status bool
}

var timeout = time.Duration(800 * time.Millisecond)

func checkUrls(urls []string, kws string, threadLimit int) []string {
    limitChan := make(chan int, threadLimit)
    ok := make(chan requestReturn, 1)
    var result []string
    i := 0
    for ; i < threadLimit; i++ {
        go func(u string) {
            request(u, limitChan, ok, kws)
        }(urls[i])
    }
    for o := range ok {
        if o.status {
            result = append(result, o.url)
            log.Printf("success %s,remain %d", o.url, len(urls)-i)
        } else {
            log.Printf("fail %s,remain %d", o.url, len(urls)-i)
        }
        if i < len(urls) {
            go func(u string) {
                request(u, limitChan, ok, kws)
            }(urls[i])
            i++
        }
    }
    close(limitChan)
    return result
}

func dialTimeout(network, addr string) (net.Conn, error) {
    return net.DialTimeout(network, addr, timeout)
}

func request(url string, threadLimit chan int, ok chan requestReturn, kws string) {
    threadLimit <- 1
    log.Printf("%s, start...", url)
    //startTime := time.Now().UnixNano()
    rr := requestReturn{url: url}
    transport := http.Transport{
        Dial:              dialTimeout,
        DisableKeepAlives: true,
    }
    client := http.Client{
        Transport: &transport,
        Timeout:   time.Duration(15 * time.Second),
    }
    resp, e := client.Get(url)
    if e != nil {
        log.Printf("%q", e)
        rr.status = false
        return
    }
    if resp.StatusCode == 200 {
        body, err := ioutil.ReadAll(resp.Body)
        if err != nil {
            log.Printf("%q", err)
            rr.status = false
            return
        }
        content := bytes.NewBuffer(body).String()
        matched, err1 := regexp.MatchString(kws, content)
        if err1 != nil {
            log.Printf("%q", err1)
            rr.status = false
        } else if matched {
            rr.status = true
            log.Println(rr.url)
        } else {
            rr.status = false
        }
    } else {
        rr.status = false
    }
    defer (func() {
        resp.Body.Close()
        ok <- rr
        //processed := float32(time.Now().UnixNano()-startTime) / 1e9
        //log.Printf("%s, status:%t,time:%.3fs", rr.url, rr.status, processed)
        <-threadLimit
    })()
}
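On the debugging question itself, one common way to investigate a hang like this (a general technique, not something used in the code above) is to expose Go's pprof endpoints and fetch a goroutine dump while the program is stuck; the port 6060 is just the conventional choice from the package documentation.

import (
    "log"
    "net/http"
    _ "net/http/pprof" // registers the /debug/pprof/ handlers on the default mux
)

func init() {
    // Serve pprof on a local port; while the crawler is hung, fetching
    // http://localhost:6060/debug/pprof/goroutine?debug=2 prints every
    // goroutine's stack and shows what each one is blocked on.
    go func() {
        log.Println(http.ListenAndServe("localhost:6060", nil))
    }()
}

In a hang like this one, the dump shows which channel operations the remaining goroutines are blocked on.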
Answer 1
Score: 3
You seem to be using two forms of concurrency control in this code, and both have problems.

You've got limitChan, which looks like it is being used as a semaphore (request sends a value at its start, and receives a value in a defer in that function). But checkUrls is also trying to make sure it only has threadLimit goroutines running at once (by spawning that number first up, and only spawning more when one reports its result on the ok channel). Only one of these should be necessary to limit the concurrency.

Both methods fail due to the way the defer is set up in request. There are a number of return statements that occur before the defer, so it is possible for the function to complete without sending the result to the ok channel and without freeing up its slot in limitChan. After a sufficient number of errors, checkUrls will stop spawning new goroutines and you'll see your hang.

The fix is to place the defer statement before any of the return statements so you know it will always be run. Something like this:
func request(url string, threadLimit chan int, ok chan requestReturn, kws string) {
    threadLimit <- 1
    rr := requestReturn{url: url}
    var resp *http.Response
    defer func() {
        if resp != nil {
            resp.Body.Close()
        }
        ok <- rr
        <-threadLimit
    }()
    ...
}
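As an illustration of the first point, here is a minimal sketch of checkUrls relying on a single limiting mechanism: a buffered channel used as a semaphore, plus a sync.WaitGroup that closes the results channel once every worker is done. It assumes request is simplified to return its result directly (func request(url, kws string) requestReturn) instead of writing to shared channels; this is not the answer's code, only one way to apply the advice.

func checkUrls(urls []string, kws string, threadLimit int) []string {
    sem := make(chan struct{}, threadLimit)        // at most threadLimit requests in flight
    results := make(chan requestReturn, len(urls)) // buffered so workers never block on send
    var wg sync.WaitGroup

    for _, u := range urls {
        wg.Add(1)
        go func(u string) {
            defer wg.Done()
            sem <- struct{}{}        // acquire a slot before doing any work
            defer func() { <-sem }() // always release it, even on an early return
            results <- request(u, kws)
        }(u)
    }

    // Close the results channel once every worker has finished,
    // so the range loop below terminates.
    go func() {
        wg.Wait()
        close(results)
    }()

    var matched []string
    for r := range results {
        if r.status {
            matched = append(matched, r.url)
        }
    }
    return matched
}

Because results can hold len(urls) entries and is closed only after wg.Wait() returns, the loop terminates even when some requests fail.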