Go并发:Chudnovky算法,比sync慢。

huangapple go评论100阅读模式
英文:

Go Concurrency: Chudnovky's algorithm, slower than sync

问题

我最近在朋友的推荐下开始学习Go语言。到目前为止,我非常喜欢它,但是我写了一个(我认为会是)轻量级并发能力的完美示例,结果却出乎意料...所以我怀疑我做错了什么,或者我对goroutine的开销理解有误。我希望这里的一些Go语言专家能够提供一些见解。

我使用goroutine和简单的同步执行来编写了Chudnovsky算法的Go语言版本。我假设,由于每个计算都是相互独立的,使用并发运行至少会稍微快一点。

注意:我在一台第五代i7上运行这个程序,所以如果goroutine像我被告知的那样被复用到线程上,这应该是并发和并行的。

package main

import (
  "fmt"
  "math"
  "strconv"
  "time"
)

func main() {
  var input string
  var sum float64
  var pi float64
  c := make(chan float64)

  fmt.Print("How many iterations? ")
  fmt.Scanln(&input)
  max,err := strconv.Atoi(input)

  if err != nil {
    panic("You did not enter a valid integer")
  }
  start := time.Now() //start timing execution of concurrent routine

  for i := 0; i < max; i++ {
    go chudnovskyConcurrent(i,c)
  }

  for i := 0; i < max; i++ {
    sum += <-c
  }
  end := time.Now() //end of concurrent routine
  fmt.Println("Duration of concurrent calculation: ",end.Sub(start))
  pi = 1/(12*sum)
  fmt.Println(pi)

  start = time.Now() //start timing execution of syncronous routine
  sum = 0
  for i := 0; i < max; i++ {
    sum += chudnovskySync(i)
  }
  end = time.Now() //end of syncronous routine
  fmt.Println("Duration of synchronous calculation: ",end.Sub(start))
  pi = 1/(12*sum)
  fmt.Println(pi)
}

func chudnovskyConcurrent(i int, c chan<- float64) {
  var numerator float64
  var denominator float64
  ifloat := float64(i)
  iun := uint64(i)
  numerator = math.Pow(-1, ifloat) * float64(factorial(6*iun)) * (545140134*ifloat+13591409)
  denominator = float64(factorial(3*iun)) * math.Pow(float64(factorial(iun)),3) * math.Pow(math.Pow(640320,3),ifloat+0.5)
  c <- numerator/denominator
}

func chudnovskySync(i int) (r float64) {
  var numerator float64
  var denominator float64
  ifloat := float64(i)
  iun := uint64(i)
  numerator = math.Pow(-1, ifloat) * float64(factorial(6*iun)) * (545140134*ifloat+13591409)
  denominator = float64(factorial(3*iun)) * math.Pow(float64(factorial(iun)),3) * math.Pow(math.Pow(640320,3),ifloat+0.5)
  r = numerator/denominator
  return
}

func factorial(n uint64) (res uint64) {
  if ( n > 0 ) {
    res = n * factorial(n-1)
    return res
  }

  return 1
}

这是我的结果:

How many iterations? 20
Duration of concurrent calculation: 573.944μs
3.1415926535897936
Duration of synchronous calculation: 63.056μs
3.1415926535897936
英文:

I just recently started learning go at the recommendation of a friend. So far, I am loving it, but I wrote (what I thought would be) the perfect example of the power of lightweight concurrency, and got a surprising result... so I suspect I'm doing something wrong, or I'm misunderstanding how expensive goroutines are. I'm hoping some gophers here can provide insight.

I wrote Chudnovsky's algorithm in Go using both goroutines and simple synchronous execution. I assumed, with each calculation being independent of the others, it'd be at least a little faster running concurrently.

note: I am running this on a 5th gen i7, so if goroutines are multiplexed onto threads as I was told, this should be both concurrent and parallel.

 package main
import (
&quot;fmt&quot;
&quot;math&quot;
&quot;strconv&quot;
&quot;time&quot;
)
func main() {
var input string
var sum float64
var pi float64
c := make(chan float64)
fmt.Print(&quot;How many iterations? &quot;)
fmt.Scanln(&amp;input)
max,err := strconv.Atoi(input)
if err != nil {
panic(&quot;You did not enter a valid integer&quot;)
}
start := time.Now() //start timing execution of concurrent routine
for i := 0; i &lt; max; i++ {
go chudnovskyConcurrent(i,c)
}
for i := 0; i &lt; max; i++ {
sum += &lt;-c
}
end := time.Now() //end of concurrent routine
fmt.Println(&quot;Duration of concurrent calculation: &quot;,end.Sub(start))
pi = 1/(12*sum)
fmt.Println(pi)
start = time.Now() //start timing execution of syncronous routine
sum = 0
for i := 0; i &lt; max; i++ {
sum += chudnovskySync(i)
}
end = time.Now() //end of syncronous routine
fmt.Println(&quot;Duration of synchronous calculation: &quot;,end.Sub(start))
pi = 1/(12*sum)
fmt.Println(pi)
}
func chudnovskyConcurrent(i int, c chan&lt;- float64) {
var numerator float64
var denominator float64
ifloat := float64(i)
iun := uint64(i)
numerator = math.Pow(-1, ifloat) * float64(factorial(6*iun)) * (545140134*ifloat+13591409)
denominator = float64(factorial(3*iun)) * math.Pow(float64(factorial(iun)),3) * math.Pow(math.Pow(640320,3),ifloat+0.5)
c &lt;- numerator/denominator
}
func chudnovskySync(i int) (r float64) {
var numerator float64
var denominator float64
ifloat := float64(i)
iun := uint64(i)
numerator = math.Pow(-1, ifloat) * float64(factorial(6*iun)) * (545140134*ifloat+13591409)
denominator = float64(factorial(3*iun)) * math.Pow(float64(factorial(iun)),3) * math.Pow(math.Pow(640320,3),ifloat+0.5)
r = numerator/denominator
return
}
func factorial(n uint64) (res uint64) {
if ( n &gt; 0 ) {
res = n * factorial(n-1)
return res
}
return 1
}

And here are my results:

How many iterations? 20
Duration of concurrent calculation:  573.944&#181;s
3.1415926535897936
Duration of synchronous calculation:  63.056&#181;s
3.1415926535897936

答案1

得分: 3

你正在做的计算太简单了,每个计算都在单独的goroutine中执行是没有必要的。你在运行时花费的时间(创建goroutines、多路复用、调度等)比实际计算时间还要多。你所尝试的做法更适合使用GPU,例如,在那里你有大量的并行执行单元,可以在瞬间完成这些简单的计算。但是你需要其他的语言和API来实现这一点。

你可以为每个硬件线程创建一个软件执行线程。你想要将max变量分成大块,并并行执行它们。这里有一个非常简单的示例来说明这个想法:

package main

import (
  "fmt"
  "math"
  "strconv"
  "time"
  "runtime"
)

func main() {
  var input string
  var sum float64
  var pi float64
  c := make(chan float64, runtime.GOMAXPROCS(-1))
  fmt.Print("How many iterations? ")
  fmt.Scanln(&input)
  max,err := strconv.Atoi(input)
  
  if err != nil {
    panic("You did not enter a valid integer")
  }
  start := time.Now() //start timing execution of concurrent routine

  for i := 0; i < runtime.GOMAXPROCS(-1); i++ {
    go func(i int){
      var sum float64
      for j := 0; j < max/runtime.GOMAXPROCS(-1); j++  {
        sum += chudnovskySync(j + i*max/runtime.GOMAXPROCS(-1))
      }
      c <- sum
    }(i)
  }
  
  for i := 0; i < runtime.GOMAXPROCS(-1); i++ {
    sum += <-c
  }
  end := time.Now() //end of concurrent routine
  fmt.Println("Duration of concurrent calculation: ",end.Sub(start))
  pi = 1/(12*sum)
  fmt.Println(pi)

  start = time.Now() //start timing execution of syncronous routine
  sum = 0
  for i := 0; i < max; i++ {
    sum += chudnovskySync(i)
  }
  end = time.Now() //end of syncronous routine
  fmt.Println("Duration of synchronous calculation: ",end.Sub(start))
  pi = 1/(12*sum)
  fmt.Println(pi)
}

func chudnovskySync(i int) (r float64) {
  var numerator float64
  var denominator float64
  ifloat := float64(i)
  iun := uint64(i)
  numerator = math.Pow(-1, ifloat) * float64(factorial(6*iun)) * (545140134*ifloat+13591409)
  denominator = float64(factorial(3*iun)) * math.Pow(float64(factorial(iun)),3) * math.Pow(math.Pow(640320,3),ifloat+0.5)
  r = numerator/denominator
  return
}

func factorial(n uint64) (res uint64) {
  if ( n > 0 ) {
    res = n * factorial(n-1)
    return res
  }

  return 1
}

这是结果:

$ go version
go version go1.5.2 windows/amd64
$ go run main.go
GOMAXPROCS = 4
How many iterations? 10000
Duration of concurrent calculation: 932.8916ms
NaN
Duration of synchronous calculation: 2.0639744s
NaN
英文:

The calculations you're doing are too simple to do each one of them in a separate goroutine. You're loosing more time in the runtime (creating goroutines, multiplexing, scheduling etc) than on actual calculations. What you're trying to do is more suited for GPU, for example, where you have massive number of parallel execution units that can do this simple calculations all at ones in an instant. But you would need other languages and APIs to do that.

What you can do is to create software thread of execution for every hardware thread of execution. You want to split your max variable into big chunks and execute them in parallel. Here's very simple take on it just to illustrate the idea:

package main
import (
&quot;fmt&quot;
&quot;math&quot;
&quot;strconv&quot;
&quot;time&quot;
&quot;runtime&quot;
)
func main() {
var input string
var sum float64
var pi float64
c := make(chan float64, runtime.GOMAXPROCS(-1))
fmt.Print(&quot;How many iterations? &quot;)
fmt.Scanln(&amp;input)
max,err := strconv.Atoi(input)
if err != nil {
panic(&quot;You did not enter a valid integer&quot;)
}
start := time.Now() //start timing execution of concurrent routine
for i := 0; i &lt; runtime.GOMAXPROCS(-1); i++ {
go func(i int){
var sum float64
for j := 0; j &lt; max/runtime.GOMAXPROCS(-1); j++  {
sum += chudnovskySync(j + i*max/runtime.GOMAXPROCS(-1))
}
c &lt;- sum
}(i)
}
for i := 0; i &lt; runtime.GOMAXPROCS(-1); i++ {
sum += &lt;-c
}
end := time.Now() //end of concurrent routine
fmt.Println(&quot;Duration of concurrent calculation: &quot;,end.Sub(start))
pi = 1/(12*sum)
fmt.Println(pi)
start = time.Now() //start timing execution of syncronous routine
sum = 0
for i := 0; i &lt; max; i++ {
sum += chudnovskySync(i)
}
end = time.Now() //end of syncronous routine
fmt.Println(&quot;Duration of synchronous calculation: &quot;,end.Sub(start))
pi = 1/(12*sum)
fmt.Println(pi)
}
func chudnovskySync(i int) (r float64) {
var numerator float64
var denominator float64
ifloat := float64(i)
iun := uint64(i)
numerator = math.Pow(-1, ifloat) * float64(factorial(6*iun)) * (545140134*ifloat+13591409)
denominator = float64(factorial(3*iun)) * math.Pow(float64(factorial(iun)),3) * math.Pow(math.Pow(640320,3),ifloat+0.5)
r = numerator/denominator
return
}
func factorial(n uint64) (res uint64) {
if ( n &gt; 0 ) {
res = n * factorial(n-1)
return res
}
return 1
}

And here's the results

$ go version
go version go1.5.2 windows/amd64
$ go run main.go
GOMAXPROCS = 4
How many iterations? 10000
Duration of concurrent calculation:  932.8916ms
NaN
Duration of synchronous calculation:  2.0639744s
NaN 

答案2

得分: 0

我同意,你的计算量不足以弥补使用多个goroutine所带来的开销。只是为了好玩,我修改了你的代码,在返回结果之前多次进行计算(1000次、10000次、100000次、1000000次)。我在运行这个代码(进行了20次迭代)时使用的是运行在四核Xeon上的Mac OS X Yosemite,正如你所预期的那样,同步版本所需的时间大约是并行版本的四倍。

有趣的是,我注意到,当重复次数很大时,同步版本实际上所需的时间超过了并行版本的四倍。我猜测这可能与英特尔的超线程架构有关,该架构允许在每个核心内部进行一定程度的并行处理,但我对此并不确定。

英文:

I agree, your calculations don't do enough processing to overcome the overhead of having multiple goroutines. Just for fun, I modified your code to do the calculation many times (1000, 10000, 100000, 1000000) before returning the result. I ran this (with 20 iterations) under Mac OS X Yosemite running on a quad core Xeon, and, as you might expect, the synchronous version takes in the neighborhood of four times as long as the parallel version.

One interesting thing I noticed is that, with a large number of repetitions, the synchronous version actually takes more than four times as long as the parallel version. I'm guessing this has something to do with Intel's hyperthreading architecture which allows some level of parallelism within each core, but I'm not sure about that.

答案3

得分: 0

你可能需要在每个线程中执行多个序列项,以提高速度,因为生成线程的开销较大。以下是使用较少线程和每个线程更多项的代码示例(但使用了不同的算法):

package main

import (
	"fmt"
	"math/rand"
	"runtime"
	"time"
)

var precision int = 2147483647
var inside int = 0
var size int = 2147483647
var threads int = runtime.NumCPU()

func getPi(total int) float64 {
	return (float64(inside) / float64(total)) * 4
}

func addpoints(times int, out chan int) {
	r1 := rand.New(rand.NewSource(time.Now().UnixNano()))
	var x, y, ret int
	ret = 0
	for i := 0; i < times; i++ {
		x = r1.Intn(size)
		y = r1.Intn(size)
		if ((x * x) + (y * y)) < (size * size) {
			ret += 1
		}
	}
	out <- ret
}

func main() {
	fmt.Println("running on " + fmt.Sprint(threads) + " threads")
	start := time.Now()
	results := make(chan int, threads)
	for i := 0; i < threads; i++ {
		go addpoints(precision/threads, results)
	}
	for i := 0; i < threads; i++ {
		inside += <-results
	}
	duration := time.Since(start)
	fmt.Println(getPi(precision))
	fmt.Println("compute time: " + fmt.Sprint(duration))
}

希望对你有帮助!

英文:

You probably need to do multiple terms of the sequence per thread to be faster due to overhead spawning threads.
such as the following code which uses less threads and more terms per thread (but with a different algorithm)

package main
import (
&quot;fmt&quot;
&quot;math/rand&quot;
&quot;runtime&quot;
&quot;time&quot;
)
var precision int = 2147483647
var inside int = 0
var size int = 2147483647
var threads int = runtime.NumCPU()
func getPi(total int) float64 {
return (float64(inside) / float64(total)) * 4
}
func addpoints(times int, out chan int) {
r1 := rand.New(rand.NewSource(time.Now().UnixNano()))
var x, y, ret int
ret = 0
for i := 0; i &lt; times; i++ {
x = r1.Intn(size)
y = r1.Intn(size)
if ((x * x) + (y * y)) &lt; (size * size) {
ret += 1
}
}
out &lt;- ret
}
func main() {
fmt.Println(&quot;running on &quot; + fmt.Sprint(threads) + &quot; threads&quot;)
start := time.Now()
results := make(chan int, threads)
for i := 0; i &lt; threads; i++ {
go addpoints(precision/threads, results)
}
for i := 0; i &lt; threads; i++ {
inside += &lt;-results
}
duration := time.Since(start)
fmt.Println(getPi(precision))
fmt.Println(&quot;compute time: &quot; + fmt.Sprint(duration))
}

huangapple
  • 本文由 发表于 2016年3月20日 00:14:09
  • 转载请务必保留本文链接:https://go.coder-hub.com/36104038.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定