Tour of Go exercise #10: Crawler
Question
I'm going through the Go Tour and I feel like I have a pretty good understanding of the language except for concurrency.
Slide 10 is an exercise that asks the reader to parallelize a web crawler (and to make it not fetch the same URL twice, but I haven't gotten there yet).
Here is what I have so far:
func Crawl(url string, depth int, fetcher Fetcher, ch chan string) {
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
ch <- fmt.Sprintln(err)
return
}
ch <- fmt.Sprintf("found: %s %q\n", url, body)
for _, u := range urls {
go Crawl(u, depth-1, fetcher, ch)
}
}
func main() {
ch := make(chan string, 100)
go Crawl("http://golang.org/", 4, fetcher, ch)
for i := range ch {
fmt.Println(i)
}
}
My question is: where do I put the close(ch) call?
If I put a defer close(ch) somewhere in the Crawl method, the program ends up writing to a closed channel from one of the spawned goroutines, because the call to Crawl returns before the spawned goroutines do.
If I omit the call to close(ch), as I do above, the program deadlocks in the main function while ranging over the channel, because the channel is never closed once all goroutines have returned.
Answer 1
Score: 30
A look at the Parallelization section of Effective Go leads to ideas for the solution. Essentially, you have to close the channel on each return path of the function. This is actually a nice use case for the defer statement:
func Crawl(url string, depth int, fetcher Fetcher, ret chan string) {
defer close(ret)
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
ret <- err.Error()
return
}
ret <- fmt.Sprintf("found: %s %q", url, body)
result := make([]chan string, len(urls))
for i, u := range urls {
result[i] = make(chan string)
go Crawl(u, depth-1, fetcher, result[i])
}
for i := range result {
for s := range result[i] {
ret <- s
}
}
return
}
func main() {
result := make(chan string)
go Crawl("http://golang.org/", 4, fetcher, result)
for s := range result {
fmt.Println(s)
}
}
The essential difference from your code is that every instance of Crawl gets its own return channel, and the calling function collects the results from its return channel.
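For comparison, here is a minimal sketch of the other common way to handle the original question of where close(ch) belongs (this sketch is an addition, not part of the answer above): keep a single shared channel, track outstanding Crawl calls with a sync.WaitGroup, and let one extra goroutine close the channel once the count reaches zero. It assumes the fmt and sync imports and the Fetcher/fetcher definitions from the Tour template.

func Crawl(url string, depth int, fetcher Fetcher, ch chan string, wg *sync.WaitGroup) {
    defer wg.Done() // every Crawl call, including the root, is counted exactly once
    if depth <= 0 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        ch <- err.Error()
        return
    }
    ch <- fmt.Sprintf("found: %s %q", url, body)
    for _, u := range urls {
        wg.Add(1) // register the child before launching it
        go Crawl(u, depth-1, fetcher, ch, wg)
    }
}

func main() {
    ch := make(chan string)
    var wg sync.WaitGroup
    wg.Add(1)
    go Crawl("http://golang.org/", 4, fetcher, ch, &wg)
    // close(ch) lives here: in a goroutine that runs only after every
    // Crawl call has returned, so no send can hit a closed channel.
    go func() {
        wg.Wait()
        close(ch)
    }()
    for s := range ch {
        fmt.Println(s)
    }
}

With this layout the range loop in main terminates normally instead of deadlocking.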
Answer 2
Score: 23
I went in a completely different direction with this one. I might have been misled by the tip about using a map.
// SafeUrlMap is safe to use concurrently.
type SafeUrlMap struct {
v map[string]string
mux sync.Mutex
}
func (c *SafeUrlMap) Set(key string, body string) {
c.mux.Lock()
// Lock so only one goroutine at a time can access the map c.v.
c.v[key] = body
c.mux.Unlock()
}
// Value returns mapped value for the given key.
func (c *SafeUrlMap) Value(key string) (string, bool) {
c.mux.Lock()
// Lock so only one goroutine at a time can access the map c.v.
defer c.mux.Unlock()
val, ok := c.v[key]
return val, ok
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, urlMap SafeUrlMap) {
defer wg.Done()
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
urlMap.Set(url, body)
for _, u := range urls {
if _, ok := urlMap.Value(u); !ok {
wg.Add(1)
go Crawl(u, depth-1, fetcher, urlMap)
}
}
return
}
var wg sync.WaitGroup
func main() {
urlMap := SafeUrlMap{v: make(map[string]string)}
wg.Add(1)
go Crawl("http://golang.org/", 4, fetcher, urlMap)
wg.Wait()
for url := range urlMap.v {
body, _ := urlMap.Value(url)
fmt.Printf("found: %s %q\n", url, body)
}
}
Answer 3
Score: 10
An O(1) lookup of the URL in a map, instead of an O(n) lookup in a slice of all visited URLs, should help minimize the time spent inside the critical section, which is a trivial amount of time for this example but would become relevant at scale.
A WaitGroup is used to prevent the top-level Crawl() function from returning until all child goroutines are complete.
func Crawl(url string, depth int, fetcher Fetcher) {
var str_map = make(map[string]bool)
var mux sync.Mutex
var wg sync.WaitGroup
var crawler func(string,int)
crawler = func(url string, depth int) {
defer wg.Done()
if depth <= 0 {
return
}
mux.Lock()
if _, ok := str_map[url]; ok {
mux.Unlock()
return
} else {
str_map[url] = true
mux.Unlock()
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q %q\n", url, body, urls)
for _, u := range urls {
wg.Add(1)
go crawler(u, depth-1)
}
}
wg.Add(1)
crawler(url,depth)
wg.Wait()
}
func main() {
Crawl("http://golang.org/", 4, fetcher)
}
Answer 4
Score: 8
Similar idea to the accepted answer, but with no duplicate URLs fetched, and printing directly to the console. defer() is not used either. We use channels to signal when goroutines complete. The SafeMap idea is lifted from the SafeCounter given previously in the Tour.
For the child goroutines, we create an array of channels, and wait until every child returns, by waiting on the channel.
package main
import (
"fmt"
"sync"
)
// SafeMap is safe to use concurrently.
type SafeMap struct {
v map[string] bool
mux sync.Mutex
}
// SetVal sets the value for the given key.
func (m *SafeMap) SetVal(key string, val bool) {
m.mux.Lock()
// Lock so only one goroutine at a time can access the map c.v.
m.v[key] = val
m.mux.Unlock()
}
// Value returns the current value of the counter for the given key.
func (m *SafeMap) GetVal(key string) bool {
m.mux.Lock()
// Lock so only one goroutine at a time can access the map c.v.
defer m.mux.Unlock()
return m.v[key]
}
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, status chan bool, urlMap SafeMap) {
// Check if we fetched this url previously.
if ok := urlMap.GetVal(url); ok {
//fmt.Println("Already fetched url!")
status <- true
return
}
// Marking this url as fetched already.
urlMap.SetVal(url, true)
if depth <= 0 {
status <- false
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
status <- false
return
}
fmt.Printf("found: %s %q\n", url, body)
statuses := make ([]chan bool, len(urls))
for index, u := range urls {
statuses[index] = make (chan bool)
go Crawl(u, depth-1, fetcher, statuses[index], urlMap)
}
// Wait for child goroutines.
for _, childstatus := range(statuses) {
<- childstatus
}
// And now this goroutine can finish.
status <- true
return
}
func main() {
urlMap := SafeMap{v: make(map[string] bool)}
status := make(chan bool)
go Crawl("https://golang.org/", 4, fetcher, status, urlMap)
<- status
}
Answer 5
Score: 3
I think using a map (the same way we could use a set in other languages) and a mutex is the easiest approach:
func Crawl(url string, depth int, fetcher Fetcher) {
mux.Lock()
defer mux.Unlock()
if depth <= 0 || IsVisited(url) {
return
}
visit[url] = true
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
//
go Crawl(u, depth-1, fetcher)
}
return
}
func IsVisited(s string) bool {
_, ok := visit[s]
return ok
}
var mux sync.Mutex
var visit = make(map[string]bool)
func main() {
Crawl("https://golang.org/", 4, fetcher)
time.Sleep(time.Second)
}
Answer 6
Score: 3
Here is my solution. I use empty structs as values in the safe cache because they are not allocated any memory. I based it off of whossname's solution.
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type safeCache struct {
m map[string]struct{}
c sync.Mutex
}
func (s *safeCache) Get(key string) bool {
s.c.Lock()
defer s.c.Unlock()
if _,ok:=s.m[key];!ok{
return false
}
return true
}
func (s *safeCache) Set(key string) {
s.c.Lock()
s.m[key] = struct{}{}
s.c.Unlock()
return
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, cach safeCache) {
defer wg.Done()
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
cach.Set(url)
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
if found := cach.Get(u); !found{
wg.Add(1)
go Crawl(u, depth-1, fetcher, cach)
}
}
return
}
var wg sync.WaitGroup
func main() {
urlSafe := safeCache{m: make(map[string]struct{})}
wg.Add(1)
go Crawl("https://golang.org/", 4, fetcher, urlSafe)
wg.Wait()
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Answer 7
Score: 2
Below is my solution. Apart from the global map, I only had to change the contents of Crawl. Like other solutions, I used sync.Map and sync.WaitGroup. I've boxed off the important parts.
var m sync.Map
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
// This implementation doesn't do either:
if depth <= 0 {
return
}
// Don't fetch the same URL twice.
/////////////////////////////////////
_, ok := m.LoadOrStore(url, url) //
if ok { //
return //
} //
/////////////////////////////////////
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
// Fetch URLs in parallel.
/////////////////////////////////////
var wg sync.WaitGroup //
defer wg.Wait() //
for _, u := range urls { //
wg.Add(1) //
go func(u string) { //
defer wg.Done() //
Crawl(u, depth-1, fetcher) //
}(u) //
} //
/////////////////////////////////////
return
}
Answer 8
Score: 1
Here's my solution. I have a "master" routine that listens on a channel of URLs and starts a new crawling routine (which puts the URLs it finds into the channel) whenever it discovers new URLs to crawl.
Instead of explicitly closing the channel, I keep a counter of unfinished crawling goroutines; when the counter reaches 0, the program exits because there is nothing left to wait for.
func doCrawl(url string, fetcher Fetcher, results chan []string) {
body, urls, err := fetcher.Fetch(url)
results <- urls
if err != nil {
fmt.Println(err)
} else {
fmt.Printf("found: %s %q\n", url, body)
}
}
func Crawl(url string, depth int, fetcher Fetcher) {
results := make(chan []string)
crawled := make(map[string]bool)
go doCrawl(url, fetcher, results)
// counter for unfinished crawling goroutines
toWait := 1
for urls := range results {
toWait--
for _, u := range urls {
if !crawled[u] {
crawled[u] = true
go doCrawl(u, fetcher, results)
toWait++
}
}
if toWait == 0 {
break
}
}
}
Answer 9
Score: 1
I have implemented it with a simple channel to which all the goroutines send their messages. To ensure that it is closed when there are no more goroutines, I use a safe counter that closes the channel when the counter reaches 0.
type Msg struct {
url string
body string
}
type SafeCounter struct {
v int
mux sync.Mutex
}
func (c *SafeCounter) inc() {
c.mux.Lock()
defer c.mux.Unlock()
c.v++
}
func (c *SafeCounter) dec(ch chan Msg) {
c.mux.Lock()
defer c.mux.Unlock()
c.v--
if c.v == 0 {
close(ch)
}
}
var goes SafeCounter = SafeCounter{v: 0}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, ch chan Msg) {
defer goes.dec(ch)
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
if depth <= 0 {
return
}
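// Note: cache is not defined in the original answer; it is assumed to be a
// mutex-protected set of visited URLs whose existsAndRegister(url) method
// reports whether url was already present and records it if not.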
if !cache.existsAndRegister(url) {
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
ch <- Msg{url, body}
for _, u := range urls {
goes.inc()
go Crawl(u, depth-1, fetcher, ch)
}
}
return
}
func main() {
ch := make(chan Msg, 100)
goes.inc()
go Crawl("http://golang.org/", 4, fetcher, ch)
for m := range ch {
fmt.Printf("found: %s %q\n", m.url, m.body)
}
}
Note that the safe counter must be incremented outside of the goroutine.
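To make the reasoning behind that note explicit (this illustration is an addition): if inc() ran inside the spawned goroutine, the parent's deferred dec(ch) could run first, see the counter hit 0, and close the channel before the child ever registered itself.

// Correct: the counter is bumped before the goroutine is scheduled.
goes.inc()
go Crawl(u, depth-1, fetcher, ch)

// Racy variant (do not use): the parent may decrement to 0 and close
// the channel before this goroutine gets a chance to call inc().
// go func() { goes.inc(); Crawl(u, depth-1, fetcher, ch) }()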
Answer 10
Score: 1
I passed the SafeCounter and WaitGroup into the Crawl function, then use the SafeCounter to skip URLs that have already been visited and the WaitGroup to prevent an early exit from the current goroutine.
func Crawl(url string, depth int, fetcher Fetcher, c *SafeCounter, wg *sync.WaitGroup) {
defer wg.Done()
if depth <= 0 {
return
}
c.mux.Lock()
c.v[url]++
c.mux.Unlock()
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
c.mux.Lock()
i := c.v[u]
c.mux.Unlock()
if i == 1 {
continue
}
wg.Add(1)
go Crawl(u, depth-1, fetcher, c, wg)
}
return
}
func main() {
c := SafeCounter{v: make(map[string]int)}
var wg sync.WaitGroup
wg.Add(1)
Crawl("https://golang.org/", 4, fetcher, &c, &wg)
wg.Wait()
}
Answer 11
Score: 1
Here is my version (inspired by @fasmat's answer). This one prevents fetching the same URL twice by using a custom cache with an RWMutex.
type Cache struct {
data map[string]fakeResult
mux sync.RWMutex
}
var cache = Cache{data: make(map[string]fakeResult)}
//cache adds new page to the global cache
func (c *Cache) cache(url string) fakeResult {
c.mux.Lock()
body, urls, err := fetcher.Fetch(url)
if err != nil {
body = err.Error()
}
data := fakeResult{body, urls}
c.data[url] = data
c.mux.Unlock()
return data
}
//Visit visits the page at the given url and caches it if needed
func (c *Cache) Visit(url string) (data fakeResult, alreadyCached bool) {
c.mux.RLock()
data, alreadyCached = c.data[url]
c.mux.RUnlock()
if !alreadyCached {
data = c.cache(url)
}
return data, alreadyCached
}
/*
Crawl crawles all pages reachable from url and within the depth (given by args).
Fetches pages using given fetcher and caches them in the global cache.
Continously sends newly discovered pages to the out channel.
*/
func Crawl(url string, depth int, fetcher Fetcher, out chan string) {
defer close(out)
if depth <= 0 {
return
}
data, alreadyCached := cache.Visit(url)
if alreadyCached {
return
}
//send newly discovered page to out channel
out <- fmt.Sprintf("found: %s %q", url, data.body)
//visit linked pages
res := make([]chan string, len(data.urls))
for i, link := range data.urls {
res[i] = make(chan string)
go Crawl(link, depth-1, fetcher, res[i])
}
//send newly discovered pages from links to out channel
for i := range res {
for s := range res[i] {
out <- s
}
}
}
func main() {
res := make(chan string)
go Crawl("https://golang.org/", 4, fetcher, res)
for page := range res {
fmt.Println(page)
}
}
Aside from not fetching any URL twice, this solution doesn't rely on knowing the total number of pages in advance (it works for any number of pages) and doesn't artificially limit or extend the program's execution time with timers.
Answer 12
Score: 1
I'm new to Go, so take this with a grain of salt, but this solution seems more idiomatic to me. It uses a single channel for all of the results, a single channel for all of the crawl requests (an attempt to crawl a specific URL), and a wait group for keeping track of completion. The main Crawl call acts as the distributor of crawl requests to worker goroutines (while handling deduplication) and as the tracker for how many crawl requests are pending.
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type FetchResult struct {
url string
body string
err error
}
type CrawlRequest struct {
url string
depth int
}
type Crawler struct {
depth int
fetcher Fetcher
results chan FetchResult
crawlRequests chan CrawlRequest
urlReservations map[string]bool
waitGroup *sync.WaitGroup
}
func (crawler Crawler) Crawl(url string, depth int) {
defer crawler.waitGroup.Done()
if depth <= 0 {
return
}
body, urls, err := crawler.fetcher.Fetch(url)
crawler.results <- FetchResult{url, body, err}
if len(urls) == 0 {
return
}
crawler.waitGroup.Add(len(urls))
for _, url := range urls {
crawler.crawlRequests <- CrawlRequest{url, depth - 1}
}
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) (results chan FetchResult) {
results = make(chan FetchResult)
urlReservations := make(map[string]bool)
crawler := Crawler{
crawlRequests: make(chan CrawlRequest),
depth: depth,
fetcher: fetcher,
results: results,
waitGroup: &sync.WaitGroup{},
}
crawler.waitGroup.Add(1)
// Listen for crawlRequests, pass them through to the caller if they aren't duplicates.
go func() {
for crawlRequest := range crawler.crawlRequests {
if _, isReserved := urlReservations[crawlRequest.url]; isReserved {
crawler.waitGroup.Done()
continue
}
urlReservations[crawlRequest.url] = true
go crawler.Crawl(crawlRequest.url, crawlRequest.depth)
}
}()
// Wait for the wait group to finish, and then close the channel
go func() {
crawler.waitGroup.Wait()
close(results)
}()
// Send the first crawl request to the channel
crawler.crawlRequests <- CrawlRequest{url, depth}
return
}
func main() {
results := Crawl("https://golang.org/", 4, fetcher)
for result := range results {
if result.err != nil {
fmt.Println(result.err)
continue
}
fmt.Printf("found: %s %q\n", result.url, result.body)
}
fmt.Printf("done!")
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Answer 13
Score: 1
Here is my solution. I had a problem where the main function doesn't wait for the goroutines to print their statuses and finish. I noticed that the previous slide used a solution that waits for 1 second before exiting, and I decided to use that approach. In practice, though, I believe some coordinating mechanism is better.
import (
"fmt"
"sync"
"time"
)
type SafeMap struct {
mu sync.Mutex
v map[string]bool
}
// Sets the given key to true.
func (sm *SafeMap) Set(key string) {
sm.mu.Lock()
sm.v[key] = true
sm.mu.Unlock()
}
// Get returns the current value for the given key.
func (sm *SafeMap) Get(key string) bool {
sm.mu.Lock()
defer sm.mu.Unlock()
return sm.v[key]
}
var safeMap = SafeMap{v: make(map[string]bool)}
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
if depth <= 0 {
return
}
// if the value exists, don't fetch it twice
if safeMap.Get(url) {
return
}
// check if there is an error fetching
body, urls, err := fetcher.Fetch(url)
safeMap.Set(url)
if err != nil {
fmt.Println(err)
return
}
// list contents and crawl recursively
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
go Crawl(u, depth-1, fetcher)
}
}
func main() {
go Crawl("https://golang.org/", 4, fetcher)
time.Sleep(time.Second)
}
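As a sketch of the coordinating mechanism mentioned above (an addition, not part of the original answer), the time.Sleep in main can be replaced by a package-level sync.WaitGroup while keeping the same safeMap and fetcher. Note that the check-then-set pattern is kept as in the original, so two goroutines racing on the same URL can still both fetch it.

var wg sync.WaitGroup

func Crawl(url string, depth int, fetcher Fetcher) {
    defer wg.Done()
    if depth <= 0 || safeMap.Get(url) {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    safeMap.Set(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        wg.Add(1) // count the child before launching it
        go Crawl(u, depth-1, fetcher)
    }
}

func main() {
    wg.Add(1)
    go Crawl("https://golang.org/", 4, fetcher)
    wg.Wait() // replaces time.Sleep: returns once every Crawl call is done
}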
Answer 14
Score: 1
No need to change any signatures or introduce anything new in the global scope. We can use a sync.WaitGroup to wait for the recurse goroutines to finish. A map from strings to empty structs acts as a set, and is the most memory-efficient way to keep track of the already-crawled URLs.
func Crawl(url string, depth int, fetcher Fetcher) {
visited := make(map[string]struct{})
var mu sync.Mutex
var wg sync.WaitGroup
var recurse func(string, int)
recurse = func(url string, depth int) {
defer wg.Done()
if depth <= 0 {
return
}
mu.Lock()
defer mu.Unlock()
if _, ok := visited[url]; ok {
return
}
visited[url] = struct{}{}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
wg.Add(1)
go recurse(u, depth-1)
}
}
wg.Add(1)
go recurse(url, depth)
wg.Wait()
}
func main() {
Crawl("https://golang.org/", 4, fetcher)
}
Full demo on the Go Playground
Answer 15
Score: 0
I use a slice to avoid crawling the same URL twice. The recursive version without concurrency is fine, but I'm not sure about this concurrent version.
func Crawl(url string, depth int, fetcher Fetcher) {
var str_arrs []string
var mux sync.Mutex
var crawl func(string, int)
crawl = func(url string, depth int) {
if depth <= 0 {
return
}
mux.Lock()
for _, v := range str_arrs {
if url == v {
mux.Unlock()
return
}
}
str_arrs = append(str_arrs, url)
mux.Unlock()
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
go crawl(u, depth-1) // could delete “go” then it is recursive
}
}
crawl(url, depth)
return
}
func main() {
Crawl("http://golang.org/", 4, fetcher)
}
Answer 16
Score: 0
Here's my solution, using sync.WaitGroup and a SafeCache of fetched urls:
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Safe to use concurrently
type SafeCache struct {
fetched map[string]string
mux sync.Mutex
}
func (c *SafeCache) Add(url, body string) {
c.mux.Lock()
defer c.mux.Unlock()
if _, ok := c.fetched[url]; !ok {
c.fetched[url] = body
}
}
func (c *SafeCache) Contains(url string) bool {
c.mux.Lock()
defer c.mux.Unlock()
_, ok := c.fetched[url]
return ok
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, cache SafeCache,
wg *sync.WaitGroup) {
defer wg.Done()
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
cache.Add(url, body)
for _, u := range urls {
if !cache.Contains(u) {
wg.Add(1)
go Crawl(u, depth-1, fetcher, cache, wg)
}
}
return
}
func main() {
cache := SafeCache{fetched: make(map[string]string)}
var wg sync.WaitGroup
wg.Add(1)
Crawl("http://golang.org/", 4, fetcher, cache, &wg)
wg.Wait()
}
Answer 17
Score: 0
Below is a simple solution for parallelization using only a sync.WaitGroup.
var fetchedUrlMap = make(map[string]bool)
var mutex sync.Mutex
func Crawl(url string, depth int, fetcher Fetcher) {
//fmt.Println("In Crawl2 with url" , url)
if _, ok := fetchedUrlMap[url]; ok {
return
}
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
mutex.Lock()
fetchedUrlMap[url] = true
mutex.Unlock()
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
var wg sync.WaitGroup
for _, u := range urls {
// fmt.Println("Solving for ", u)
wg.Add(1)
go func(uv string) {
Crawl(uv, depth-1, fetcher)
wg.Done()
}(u)
}
wg.Wait()
}
Answer 18
Score: 0
Here is my solution
package main
import (
"fmt"
"runtime"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, set map[string]bool) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
if depth <= 0 {
return
}
// use a set to identify if the URL should be traversed or not
if set[url] == true {
wg.Done()
return
} else {
fmt.Println(runtime.NumGoroutine())
set[url] = true
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
Crawl(u, depth-1, fetcher, set)
}
}
}
var wg sync.WaitGroup
func main() {
wg.Add(6)
collectedURLs := make(map[string]bool)
go Crawl("https://golang.org/", 4, fetcher, collectedURLs)
wg.Wait()
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Answer 19
Score: 0
Since most of the solutions here don't work for me (including the accepted answer), I'll add my own, inspired by Kamil's (special thanks!). It fetches no duplicates and visits all valid URLs.
package main
import (
"fmt"
"runtime"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, set map[string]bool) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
if depth <= 0 { return }
// use a set to identify if the URL should be traversed or not
fmt.Println(runtime.NumGoroutine())
set[url] = true
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
if set[u] == false {
wg.Add(1)
go Crawl(u, depth-1, fetcher, set)
}
}
wg.Done()
}
var wg sync.WaitGroup
func main() {
collectedURLs := make(map[string]bool)
wg.Add(1)
Crawl("https://golang.org/", 4, fetcher, collectedURLs)
wg.Wait()
}
Answer 20
Score: 0
/*
Exercise: Web Crawler
In this exercise you'll use Go's concurrency features to parallelize a web crawler.
Modify the Crawl function to fetch URLs in parallel without fetching the same URL twice.
Hint: you can keep a cache of the URLs that have been fetched on a map, but maps alone are not safe for concurrent use!
*/
package main
import (
"fmt"
"sync"
"time"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type Response struct {
url string
urls []string
body string
err error
}
var ch chan Response = make(chan Response)
var fetched map[string]bool = make(map[string]bool)
var wg sync.WaitGroup
var mu sync.Mutex
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
var fetch func(url string, depth int, fetcher Fetcher)
wg.Add(1)
recv := func() {
for res := range ch {
body, _, err := res.body, res.urls, res.err
if err != nil {
fmt.Println(err)
continue
}
fmt.Printf("found: %s %q\n", url, body)
}
}
fetch = func(url string, depth int, fetcher Fetcher) {
time.Sleep(time.Second / 2)
defer wg.Done()
if depth <= 0 || fetched[url] {
return
}
mu.Lock()
fetched[url] = true
mu.Unlock()
body, urls, err := fetcher.Fetch(url)
for _, u := range urls {
wg.Add(1)
go fetch(u, depth-1, fetcher)
}
ch <- Response{url, urls, body, err}
}
go fetch(url, depth, fetcher)
go recv()
return
}
func main() {
Crawl("https://golang.org/", 4, fetcher)
wg.Wait()
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd1/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd2/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
https://gist.github.com/gaogao1030/5d63ed925534f3610ccb7e25ed46992a
Answer 21
Score: 0
Super-simple solution, using one channel per fetched URL to wait for the goroutines crawling the URLs found in the corresponding body. Duplicate URLs are avoided using a UrlCache struct with a mutex and a map[string]struct{} (this saves memory compared with a map of booleans). Side effects that could potentially cause deadlocks are mitigated by using defer for both the mutex unlocking and the channel writes.
package main
import (
"fmt"
"sync"
)
type UrlCache struct {
v map[string]struct{}
mux sync.Mutex
}
func NewUrlCache() *UrlCache {
res := UrlCache{}
res.v = make(map[string]struct{})
return &res
}
func (c *UrlCache) check(url string) bool {
c.mux.Lock()
defer c.mux.Unlock()
if _, p := c.v[url]; !p {
c.v[url] = struct{}{}
return false
}
return true
}
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, uc *UrlCache, c chan struct{}) {
defer func() { c <- struct{}{} }()
if depth <= 0 {
return
}
if uc.check(url) {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
ci := make(chan struct{})
for _, u := range urls {
go Crawl(u, depth-1, fetcher, uc, ci)
}
// Wait for parallel crawl to finish
for range urls {
<-ci
}
}
func main() {
c := make(chan struct{})
go Crawl("https://golang.org/", 4, fetcher, NewUrlCache(), c)
<-c
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
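As an aside on the `map[string]struct{}` idiom that `UrlCache` relies on: the empty struct occupies zero bytes, so the map effectively stores only its keys. A minimal standalone sketch of the idiom (my own illustration, not part of the answer above):

```go
package main

import (
	"fmt"
	"unsafe"
)

func main() {
	// A string set: the values are the zero-size empty struct.
	seen := make(map[string]struct{})
	seen["https://golang.org/"] = struct{}{}

	// Membership is tested with the comma-ok form, as in UrlCache.check.
	if _, ok := seen["https://golang.org/"]; ok {
		fmt.Println("already seen")
	}

	// The empty struct really is zero bytes (a bool value is one byte).
	fmt.Println(unsafe.Sizeof(struct{}{}), unsafe.Sizeof(false)) // 0 1
}
```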
Answer 22
Score: 0
Below is my solution. Defer is a really powerful semantic in golang.
// FetchedUrls tracks already-fetched URLs behind a mutex
// (definition inferred from its use below; the original answer does not show it).
type FetchedUrls struct {
m sync.Mutex
urls map[string]bool
}
var urlcache FetchedUrls
func (urlcache *FetchedUrls) CacheIfNotPresent(url string) bool {
urlcache.m.Lock()
defer urlcache.m.Unlock()
_, ok := urlcache.urls[url]
if !ok {
urlcache.urls[url] = true
}
return !ok
}
func BlockOnChan(ch chan int) {
<-ch
}
// Crawl uses fetcher to recursively crawl pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, ch chan int) {
defer close(ch)
if depth <= 0 {
return
}
if !urlcache.CacheIfNotPresent(url) {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
fch := make(chan int)
defer BlockOnChan(fch)
go Crawl(u, depth-1, fetcher, fch)
}
}
func main() {
urlcache.urls = make(map[string]bool)
Crawl("https://golang.org/", 4, fetcher, make(chan int))
}
English:
Below is my solution. Defer is a really powerful semantic in golang.
// FetchedUrls tracks already-fetched URLs behind a mutex
// (definition inferred from its use below; the original answer does not show it).
type FetchedUrls struct {
m sync.Mutex
urls map[string]bool
}
var urlcache FetchedUrls
func (urlcache *FetchedUrls) CacheIfNotPresent(url string) bool {
urlcache.m.Lock()
defer urlcache.m.Unlock()
_, ok := urlcache.urls[url]
if !ok {
urlcache.urls[url] = true
}
return !ok
}
func BlockOnChan(ch chan int) {
<-ch
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, ch chan int) {
defer close(ch)
if depth <= 0 {
return
}
if !urlcache.CacheIfNotPresent(url) {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
fch := make(chan int)
defer BlockOnChan(fch)
go Crawl(u, depth-1, fetcher, fch)
}
}
func main() {
urlcache.urls = make(map[string]bool)
Crawl("https://golang.org/", 4, fetcher, make(chan int))
}
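What makes the answer above work is that deferred calls run in last-in-first-out order when the surrounding function returns: every `defer BlockOnChan(fch)` queued inside the loop fires before the `defer close(ch)` that was registered first, so `Crawl` waits for all of its children before its own channel is closed. A tiny standalone sketch of that ordering (illustrative only, not from the answer):

```go
package main

import "fmt"

func crawlLike() {
	defer fmt.Println("close(ch) runs last") // registered first, runs last
	for i := 0; i < 3; i++ {
		// Each iteration queues another deferred call; none of them runs yet.
		defer fmt.Println("wait on child", i)
	}
	fmt.Println("loop done, returning")
}

func main() {
	crawlLike()
	// Output order: "loop done", then "wait on child" 2, 1, 0, then "close(ch) runs last".
}
```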
Answer 23
Score: 0
Adding my solution for others to reference. Hope it helps.
Being able to compare our different approaches is just great!
You can try the code below in the Go Playground.
func Crawl(url string, depth int, fetcher Fetcher) {
defer wg.Done()
if depth <= 0 {
return
} else if _, ok := fetched.Load(url); ok {
fmt.Printf("Skipping (already fetched): %s\n", url)
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
fetched.Store(url, nil)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher)
}
}
// As there could be many types of errors when fetching a URL, a URL is only marked once it has been processed correctly
var fetched sync.Map
// wg is incremented for each Crawl, and the main function waits for all of them to finish
var wg sync.WaitGroup
func main() {
wg.Add(1)
go Crawl("https://golang.org/", 4, fetcher)
wg.Wait()
}
English:
Adding my solution for others to reference. Hope it helps.
Being able to compare our different approaches is just great!
You can try the code below in the Go Playground.
func Crawl(url string, depth int, fetcher Fetcher) {
defer wg.Done()
if depth <= 0 {
return
} else if _, ok := fetched.Load(url); ok {
fmt.Printf("Skipping (already fetched): %s\n", url)
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
fetched.Store(url, nil)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher)
}
}
// As there could be many types of events leading to errors when
// fetching a url, only marking it when it is correctly processed
var fetched sync.Map
// For each Crawl, wg is incremented, and it waits for all to finish
// on main method
var wg sync.WaitGroup
func main() {
wg.Add(1)
go Crawl("https://golang.org/", 4, fetcher)
wg.Wait()
}
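One caveat about the answer above: `fetched.Load` and the later `fetched.Store` are two separate operations, so two goroutines that reach the same URL at nearly the same time can both fetch it. If that window matters, `sync.Map.LoadOrStore` performs the check-and-mark atomically. A minimal sketch of that variant (my own adjustment, and note it marks the URL before fetching, whereas the answer deliberately records it only after a successful fetch):

```go
package main

import (
	"fmt"
	"sync"
)

var fetched sync.Map

// markNew reports whether url is seen for the first time, recording it
// atomically so two concurrent callers can never both get true.
func markNew(url string) bool {
	_, loaded := fetched.LoadOrStore(url, struct{}{})
	return !loaded
}

func main() {
	fmt.Println(markNew("https://golang.org/")) // true: first visit
	fmt.Println(markNew("https://golang.org/")) // false: already recorded
}
```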
Answer 24
Score: 0
Here is my solution:
package main
import (
"fmt"
"sync"
"time"
)
type Fetcher interface {
// Fetch returns the body of URL and a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
var crawlerWG = sync.WaitGroup{}
// Crawl uses fetcher to recursively crawl pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
defer crawlerWG.Done()
fmt.Printf("fetching data for %s\n", url)
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("找到:%s %q\n", url, body)
for _, u := range urls {
crawlerWG.Add(1)
go Crawl(u, depth-1, fetcher)
}
return
}
func main() {
startedAt := time.Now()
crawlerWG.Add(1)
Crawl("https://golang.org/", 4, fetcher)
crawlerWG.Wait()
lasted := time.Since(startedAt)
fmt.Printf("所有页面已爬取。共用时%.2f秒\n", lasted.Seconds())
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
var fetcherMutex = sync.RWMutex{}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
fetcherMutex.RLock()
defer fetcherMutex.RUnlock()
time.Sleep(time.Millisecond * 1000)
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("未找到:%s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
English:
Here is my solution:
package main
import (
"fmt"
"sync"
"time"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
var crawlerWG = sync.WaitGroup{}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
defer crawlerWG.Done()
fmt.Printf("fetching data for %s\n", url)
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
crawlerWG.Add(1)
go Crawl(u, depth-1, fetcher)
}
return
}
func main() {
startedAt := time.Now()
crawlerWG.Add(1)
Crawl("https://golang.org/", 4, fetcher)
crawlerWG.Wait()
lasted := time.Since(startedAt)
fmt.Printf("All pages crawled. It took %.2f seconds\n", lasted.Seconds())
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
var fetcherMutex = sync.RWMutex{}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
fetcherMutex.RLock()
defer fetcherMutex.RUnlock()
time.Sleep(time.Millisecond * 1000)
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
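A note on the `sync.RWMutex` used in the fake fetcher above: `RLock` admits any number of concurrent readers, which is what lets the simulated one-second fetches overlap instead of queueing behind each other. A small standalone sketch of that difference (timings are illustrative, not from the answer):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	var mu sync.RWMutex
	var wg sync.WaitGroup

	start := time.Now()
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			mu.RLock() // readers do not block one another
			defer mu.RUnlock()
			time.Sleep(100 * time.Millisecond)
		}()
	}
	wg.Wait()
	// With RLock the four sleeps overlap, so this prints roughly 100ms;
	// with an exclusive mu.Lock in each goroutine it would be about 400ms.
	fmt.Println(time.Since(start).Round(10 * time.Millisecond))
}
```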
Answer 25
Score: 0
Simple solution without many locks and races:
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
Fetch(url string) (body string, urls []string, err error)
}
type Cache struct {
mutex sync.Mutex
used map[string]bool
}
func Crawl(url string, depth int, fetcher Fetcher) {
var cache Cache
cache.used = make(map[string]bool)
var crawler func(string, int, chan<- string)
out := make(chan string)
crawler = func(url string, depth int, out chan<- string) {
defer close(out)
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
out <- fmt.Sprintf("%s\n", err.Error())
return
}
cache.mutex.Lock()
if _, ok := cache.used[url]; ok {
cache.mutex.Unlock()
return
}
out <- fmt.Sprintf("found: %s %q\n", url, body)
cache.used[url] = true
cache.mutex.Unlock()
rets := make([]chan string, len(urls))
for i, u := range urls {
rets[i] = make(chan string, 2)
go crawler(u, depth-1, rets[i])
}
for i := range rets {
for line := range rets[i] {
out <- line
}
}
return
}
go crawler(url, depth, out)
for line := range out {
fmt.Print(line)
}
return
}
func main() {
Crawl("https://golang.org/", 4, fetcher)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
English:
Simple solution without many locks and races:
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
Fetch(url string) (body string, urls []string, err error)
}
type Cache struct {
mutex sync.Mutex
used map[string]bool
}
func Crawl(url string, depth int, fetcher Fetcher) {
var cache Cache
cache.used = make(map[string]bool)
var crawler func(string, int, chan<- string)
out := make(chan string)
crawler = func(url string, depth int, out chan<- string) {
defer close(out)
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
out <- fmt.Sprintf("%s\n", err.Error())
return
}
cache.mutex.Lock()
if _, ok := cache.used[url]; ok {
cache.mutex.Unlock()
return
}
out <- fmt.Sprintf("found: %s %q\n", url, body)
cache.used[url] = true
cache.mutex.Unlock()
rets := make([]chan string, len(urls))
for i, u := range urls {
rets[i] = make(chan string, 2)
go crawler(u, depth-1, rets[i])
}
for i := range rets {
for line := range rets[i] {
out <- line
}
}
return
}
go crawler(url, depth, out)
for line := range out {
fmt.Print(line)
}
return
}
func main() {
Crawl("https://golang.org/", 4, fetcher)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
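A small detail in the answer above: the inner `crawler` closure takes its output as `chan<- string`, a send-only channel, so the compiler rejects any accidental receive from the wrong side while still allowing the deferred `close`. A minimal sketch of that restriction (independent of the answer):

```go
package main

import "fmt"

// produce gets a send-only view of the channel: it may send and close it,
// but a receive such as `<-out` inside this function would not compile.
func produce(out chan<- string) {
	defer close(out)
	out <- "found: https://golang.org/"
}

func main() {
	ch := make(chan string)
	go produce(ch)
	for line := range ch {
		fmt.Println(line)
	}
}
```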
Answer 26
Score: -1
Using mutex and channels
package main
import (
"fmt"
"sync"
)
type SafeMap struct {
mu sync.Mutex
seen map[string]bool
}
func (s *SafeMap) getVal(url string) bool {
s.mu.Lock()
defer s.mu.Unlock()
return s.seen[url]
}
func (s *SafeMap) setVal(url string) {
s.mu.Lock()
defer s.mu.Unlock()
s.seen[url] = true
}
var s = SafeMap{seen: make(map[string]bool)}
type Fetcher interface {
// Fetch returns the body of URL and a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, ch chan bool) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
if depth <= 0 || s.getVal(url) {
ch <- false
return
}
body, urls, err := fetcher.Fetch(url)
s.setVal(url)
if err != nil {
fmt.Println(err)
ch <- false
return
}
fmt.Printf("found: %s %q\n", url, body)
chs := make(map[string]chan bool, len(urls))
for _, u := range urls {
chs[u] = make(chan bool)
go Crawl(u, depth-1, fetcher, chs[u])
}
for _,v := range urls {
<-chs[v]
}
ch <- true
return
}
func main() {
ch := make(chan bool)
go Crawl("https://golang.org/", 4, fetcher, ch)
<-ch
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
English:
Using mutex and channels
package main
import (
"fmt"
"sync"
)
type SafeMap struct {
mu sync.Mutex
seen map[string]bool
}
func (s *SafeMap) getVal(url string) bool {
s.mu.Lock()
defer s.mu.Unlock()
return s.seen[url]
}
func (s *SafeMap) setVal(url string) {
s.mu.Lock()
defer s.mu.Unlock()
s.seen[url] = true
}
var s = SafeMap{seen: make(map[string]bool)}
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, ch chan bool) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
if depth <= 0 || s.getVal(url) {
ch <- false
return
}
body, urls, err := fetcher.Fetch(url)
s.setVal(url)
if err != nil {
fmt.Println(err)
ch <- false
return
}
fmt.Printf("found: %s %q\n", url, body)
chs := make(map[string]chan bool, len(urls))
for _, u := range urls {
chs[u] = make(chan bool)
go Crawl(u, depth-1, fetcher, chs[u])
}
for _,v := range urls {
<-chs[v]
}
ch <- true
return
}
func main() {
ch := make(chan bool)
go Crawl("https://golang.org/", 4, fetcher, ch)
<-ch
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Answer 27
Score: -1
You can solve the problem of closing the channel by using a sync.WaitGroup and spawning a separate goroutine to close the channel.
This solution does not address the requirement to avoid repeated visits to URLs.
func Crawl(url string, depth int, fetcher Fetcher, ch chan string, wg *sync.WaitGroup) {
defer wg.Done()
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
ch <- fmt.Sprintln(err)
return
}
ch <- fmt.Sprintf("found: %s %q", url, body)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher, ch, wg)
}
}
func main() {
ch := make(chan string)
var wg sync.WaitGroup
wg.Add(1)
go Crawl("https://golang.org/", 4, fetcher, ch, &wg)
go func() {
wg.Wait()
close(ch)
}()
for i := range ch {
fmt.Println(i)
}
}
English:
You can solve the problem of closing the channel by using a sync.WaitGroup and spawning a separate goroutine to close the channel.
This solution does not address the requirement to avoid repeated visits to URLs.
func Crawl(url string, depth int, fetcher Fetcher, ch chan string, wg *sync.WaitGroup) {
defer wg.Done()
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
ch <- fmt.Sprintln(err)
return
}
ch <- fmt.Sprintf("found: %s %q", url, body)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher, ch, wg)
}
}
func main() {
ch := make(chan string)
var wg sync.WaitGroup
wg.Add(1)
go Crawl("https://golang.org/", 4, fetcher, ch, &wg)
go func() {
wg.Wait()
close(ch)
}()
for i := range ch {
fmt.Println(i)
}
}
Answer 28
Score: -1
package main
import (
"fmt"
"sync"
"time"
)
type SafeCounter struct {
sync.Mutex
v map[string]int
}
func (c *SafeCounter) Inc(key string) {
c.Lock()
defer c.Unlock()
c.v[key]++
}
func (c *SafeCounter) Value(key string) int {
c.Lock()
defer c.Unlock()
return c.v[key]
}
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, c *SafeCounter) {
if depth <= 0 || c.Value(url) > 0 {
return
}
c.Inc(url)
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
if c.Value(u) == 0 {
go Crawl(u, depth-1, fetcher, c)
}
}
return
}
func main() {
c := SafeCounter{v: make(map[string]int)}
Crawl("https://golang.org/", 4, fetcher, &c)
time.Sleep(time.Second)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
English:
package main
import (
"fmt"
"sync"
"time"
)
type SafeCounter struct {
sync.Mutex
v map[string]int
}
func (c *SafeCounter) Inc(key string) {
c.Lock()
defer c.Unlock()
c.v[key]++
}
func (c *SafeCounter) Value(key string) int {
c.Lock()
defer c.Unlock()
return c.v[key]
}
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, c *SafeCounter) {
if depth <= 0 || c.Value(url) > 0 {
return
}
c.Inc(url)
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
if c.Value(u) == 0 {
go Crawl(u, depth-1, fetcher, c)
}
}
return
}
func main() {
c := SafeCounter{v: make(map[string]int)}
Crawl("https://golang.org/", 4, fetcher, &c)
time.Sleep(time.Second)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}