Go web crawler hangs after checking about 2000 URLs


Question

I have a program that checks whether keywords appear on a web page. But after checking 1000-3000 URLs it hangs: there is no output, it does not exit, and the number of open TCP connections is zero. I don't know why no new connections are being made.

Could you give me some advice on how to debug it?

type requestReturn struct {    
    url    string    
    status bool
}

var timeout = time.Duration(800 * time.Millisecond)    

func checkUrls(urls []string, kws string, threadLimit int) []string {    
    limitChan := make(chan int, threadLimit)    
    ok := make(chan requestReturn, 1)    
    var result []string    
    i := 0    
    for ; i < threadLimit; i++ {    
        go func(u string) {    
            request(u, limitChan, ok, kws)    
        }(urls[i])    
    }    
    for o := range ok {    
        if o.status {    
            result = append(result, o.url)    
            log.Printf("success %s,remain %d", o.url, len(urls)-i)    
        } else {    
            log.Printf("fail %s,remain %d", o.url, len(urls)-i)    
        }    
        if i < len(urls) {    
            go func(u string) {    
                request(u, limitChan, ok, kws)    
            }(urls[i])    
            i++    
        }    
    }    
    close(limitChan)    
    return result    
}    

func dialTimeout(network, addr string) (net.Conn, error) {    
    return net.DialTimeout(network, addr, timeout)    
}    

func request(url string, threadLimit chan int, ok chan requestReturn, kws string) {    
    threadLimit <- 1    
    log.Printf("%s, start...", url)    
    //startTime := time.Now().UnixNano()    
    rr := requestReturn{url: url}    

    transport := http.Transport{    
        Dial:              dialTimeout,    
        DisableKeepAlives: true,    
    }    

    client := http.Client{    
        Transport: &transport,    
        Timeout:   time.Duration(15 * time.Second),    
    }    

    resp, e := client.Get(url)    
    if e != nil {    
        log.Printf("%q", e)    
        rr.status = false    
        return    
    }    

    if resp.StatusCode == 200 {    
        body, err := ioutil.ReadAll(resp.Body)    
        if err != nil {    
            log.Printf("%q", err)    
            rr.status = false    
            return    
        }    

        content := bytes.NewBuffer(body).String()    

        matched, err1 := regexp.MatchString(kws, content)    
        if err1 != nil {    
            log.Printf("%q", err1)    
            rr.status = false    
        } else if matched {    
            rr.status = true    
            log.Println(rr.url)    
        } else {    
            rr.status = false    
        }    
    } else {    
        rr.status = false    
    }    

    defer (func() {    
        resp.Body.Close()    
        ok <- rr    
        //processed := float32(time.Now().UnixNano()-startTime) / 1e9    
        //log.Printf("%s, status:%t,time:%.3fs", rr.url, rr.status, processed)    
        <-threadLimit    
    })()    
}

Answer 1

Score: 3

You seem to be using two forms of concurrency control in this code, and both have problems.

First there is limitChan, which looks like it is being used as a semaphore (request sends a value into it when it starts, and receives from it in a defer to release its slot). But checkUrls is also trying to ensure that only threadLimit goroutines run at once, by spawning that many up front and spawning another only when one of them reports a result on the ok channel. Only one of these mechanisms is needed to limit the concurrency.
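For reference, here is the channel-as-semaphore pattern on its own. This is a minimal sketch, with checkAll and fetch as hypothetical stand-ins for the real functions; it needs "sync" imported:

    // checkAll runs fetch on every URL with at most limit in flight.
    // All goroutines start immediately, but each must acquire a slot
    // in sem before doing any work.
    func checkAll(urls []string, limit int, fetch func(string)) {
        sem := make(chan struct{}, limit) // capacity = max concurrency
        var wg sync.WaitGroup
        for _, u := range urls {
            wg.Add(1)
            go func(u string) {
                defer wg.Done()
                sem <- struct{}{}        // acquire a slot
                defer func() { <-sem }() // release it on every exit path
                fetch(u)
            }(u)
        }
        wg.Wait()
    }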

Both mechanisms break because of the way the defer is set up in request. Several return statements execute before the defer is registered, so the function can complete without sending its result on the ok channel and without freeing its slot in limitChan. Every such early exit permanently consumes a slot; after enough errors, checkUrls stops spawning new goroutines and the program hangs.
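Stripped of the HTTP details, the failure mode looks like this (the names here are hypothetical):

    // The deferred send is only registered if execution reaches the
    // defer statement, so the early return below exits without ever
    // reporting anything on done.
    func leaky(fail bool, done chan struct{}) {
        if fail {
            return // the defer below was never registered: nothing is sent
        }
        defer func() { done <- struct{}{} }()
        // ... do the real work ...
    }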

The fix is to register the defer before any of the return statements, so it is guaranteed to run. Something like this:

func request(url string, threadLimit chan int, ok chan requestReturn, kws string) {
    threadLimit <- 1 // acquire a semaphore slot
    rr := requestReturn{url: url}
    var resp *http.Response
    // Registered before any return, so this runs on every exit path:
    // close the body (if any), report the result, free the slot.
    defer func() {
        if resp != nil {
            resp.Body.Close()
        }
        ok <- rr
        <-threadLimit
    }()
    ...
}
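With the defer registered up front, every call to request sends exactly one value on ok and releases exactly one limitChan slot, no matter which path the function exits through, so checkUrls always gets a result for every goroutine it spawned.

As for debugging hangs like this in general: dump all goroutine stacks while the process is stuck and look at what each one is blocked on. Below is a minimal sketch, assuming a Unix-like system; the helper name installStackDumper and the choice of SIGUSR1 are illustrative:

    import (
        "os"
        "os/signal"
        "runtime/pprof"
        "syscall"
    )

    // installStackDumper prints one stack trace per goroutine to stderr
    // whenever the process receives SIGUSR1, without killing it.
    func installStackDumper() {
        sigs := make(chan os.Signal, 1)
        signal.Notify(sigs, syscall.SIGUSR1)
        go func() {
            for range sigs {
                // debug level 1: one stack trace per goroutine
                pprof.Lookup("goroutine").WriteTo(os.Stderr, 1)
            }
        }()
    }

Call it at startup and run kill -USR1 <pid> while the program is hung; goroutines parked on "chan send" or "chan receive" point straight at the leak. Sending SIGQUIT (Ctrl+\) produces the same dump but also terminates the process.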
