Change the sample by using goroutine?

Question

I found a good broken-link checker on the web, but how can I change it into a complete example that uses goroutines? The web page is: How To Crawl A Website In Golang. The code dynamically appends the URLs to be searched to the `pending` slice, but I am having some difficulty doing that with goroutines.

```go
package main

import (
    "crypto/tls"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "strings"
    "time"

    "golang.org/x/net/html"
)

var alreadyCrawledList []string
var pending []string
var brokenLinks []string

const localHostWithPort = "localhost:8080"

func IsLinkInPendingQueue(link string) bool {
    for _, x := range pending {
        if x == link {
            return true
        }
    }
    return false
}

func IsLinkAlreadyCrawled(link string) bool {
    for _, x := range alreadyCrawledList {
        if x == link {
            return true
        }
    }
    return false
}

func AddLinkInAlreadyCrawledList(link string) {
    alreadyCrawledList = append(alreadyCrawledList, link)
}

func AddLinkInPendingQueue(link string) {
    pending = append(pending, link)
}

func AddLinkInBrokenLinksQueue(link string) {
    brokenLinks = append(brokenLinks, link)
}

func main() {
    start := time.Now()
    AddLinkInPendingQueue("http://" + localHostWithPort)
    for len(pending) > 0 {
        x := pending[0]
        pending = pending[1:] // the queue of URLs to search changes dynamically
        if err := crawlPage(x); err != nil { // how to do this with a goroutine?
            fmt.Println(err)
        }
    }
    duration := time.Since(start)
    fmt.Println("________________")
    count := 0
    for _, l := range brokenLinks {
        count++
        fmt.Println(count, "Broken. | ", l)
    }
    fmt.Println("Time taken:", duration)
}

func crawlPage(uri string) error {
    if IsLinkAlreadyCrawled(uri) {
        fmt.Println("Already visited: Ignoring uri | ", uri)
        return nil
    }
    transport := &http.Transport{
        TLSClientConfig: &tls.Config{
            InsecureSkipVerify: true, // accept self-signed certs on localhost
        },
    }
    client := http.Client{Transport: transport}
    resp, err := client.Get(uri)
    if err != nil {
        fmt.Println("Got error: ", err.Error())
        return err
    }
    defer resp.Body.Close() // close the body on every path, not only on 200
    if resp.StatusCode != http.StatusOK {
        AddLinkInBrokenLinksQueue(uri)
        return fmt.Errorf("got %v instead of 200", resp.StatusCode)
    }
    links := ParseLinks(resp.Body)
    links = ConvertLinksToLocalHost(links)
    for _, link := range links {
        if !InOurDomain(link) {
            continue
        }
        absolute := FixURL(link, uri)
        if !IsLinkAlreadyCrawled(absolute) && !IsLinkInPendingQueue(absolute) && absolute != uri { // don't enqueue a page twice!
            AddLinkInPendingQueue(absolute)
        }
    }
    AddLinkInAlreadyCrawledList(uri)
    return nil
}

func InOurDomain(link string) bool {
    uri, err := url.Parse(link)
    if err != nil {
        return false
    }
    if uri.Scheme == "http" || uri.Scheme == "https" {
        return uri.Host == localHostWithPort
    }
    return true // relative links have no scheme and stay on our host
}

func ConvertLinksToLocalHost(links []string) []string {
    var convertedLinks []string
    for _, link := range links {
        convertedLinks = append(convertedLinks, strings.Replace(link, "leantricks.com", localHostWithPort, 1))
    }
    return convertedLinks
}

func FixURL(href, base string) string {
    uri, err := url.Parse(href)
    if err != nil {
        return ""
    }
    baseURL, err := url.Parse(base)
    if err != nil {
        return ""
    }
    uri = baseURL.ResolveReference(uri)
    return uri.String()
}

func ParseLinks(httpBody io.Reader) []string {
    var links []string
    page := html.NewTokenizer(httpBody)
    for {
        tokenType := page.Next()
        if tokenType == html.ErrorToken {
            return links
        }
        token := page.Token()
        switch tokenType {
        case html.StartTagToken, html.SelfClosingTagToken:
            switch token.DataAtom.String() {
            case "a", "link", "script":
                for _, attr := range token.Attr {
                    if attr.Key == "href" {
                        links = append(links, attr.Val)
                    }
                }
            }
        }
    }
}
```

Answer 1

Score: 1

You could invoke crawlPage() concurrently and protect the alreadyCrawledList, pending, and brokenLinks variables with mutexes (not very performant, though). Getting real performance out of it would require modifying the code quite a lot.
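Below is a minimal sketch of the mutex approach described above (it is not the answerer's linked sample). It assumes crawlPage() and the parsing helpers from the question stay as they are: one `sync.Mutex` guards the three shared slices, and each batch of pending URLs is crawled by goroutines joined with a `sync.WaitGroup`:

```go
// Sketch only: add "sync" to the imports of the program above and replace
// the queue helpers and main() with these versions; crawlPage() and the
// parsing helpers are unchanged.

var mu sync.Mutex // guards alreadyCrawledList, pending and brokenLinks

func IsLinkInPendingQueue(link string) bool {
    mu.Lock()
    defer mu.Unlock()
    for _, x := range pending {
        if x == link {
            return true
        }
    }
    return false
}

func IsLinkAlreadyCrawled(link string) bool {
    mu.Lock()
    defer mu.Unlock()
    for _, x := range alreadyCrawledList {
        if x == link {
            return true
        }
    }
    return false
}

func AddLinkInAlreadyCrawledList(link string) {
    mu.Lock()
    defer mu.Unlock()
    alreadyCrawledList = append(alreadyCrawledList, link)
}

func AddLinkInPendingQueue(link string) {
    mu.Lock()
    defer mu.Unlock()
    pending = append(pending, link)
}

func AddLinkInBrokenLinksQueue(link string) {
    mu.Lock()
    defer mu.Unlock()
    brokenLinks = append(brokenLinks, link)
}

func main() {
    start := time.Now()
    AddLinkInPendingQueue("http://" + localHostWithPort)
    for len(pending) > 0 {
        // Drain the current batch under the lock, then crawl every URL
        // in it concurrently.
        mu.Lock()
        batch := pending
        pending = nil
        mu.Unlock()

        var wg sync.WaitGroup
        for _, uri := range batch {
            wg.Add(1)
            go func(uri string) { // pass uri as a parameter so each goroutine gets its own copy
                defer wg.Done()
                if err := crawlPage(uri); err != nil {
                    fmt.Println(err)
                }
            }(uri)
        }
        wg.Wait() // no goroutine is running when the loop condition is re-checked
    }
    fmt.Println("________________")
    for i, l := range brokenLinks {
        fmt.Println(i+1, "Broken. | ", l)
    }
    fmt.Println("Time taken:", time.Since(start))
}
```

Note that duplicate work is still possible in this sketch: the "not already queued" check and the enqueue inside crawlPage() happen under separate lock acquisitions, so two goroutines can enqueue the same URL for the next batch. That is one reason a lot more modification is needed for real performance; a more thorough rewrite would feed a fixed pool of workers through a channel and track visited URLs in a map instead of scanning slices.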

I did a quick check with 4 links, and it seems to roughly halve the duration. I wrote some sample code against a simple HTTP server; you can find it here.

Thanks,

Anoop
