Best time to close a channel when iterating over it - Go

I am playing around with Go and created this little app to make several concurrent API calls using goroutines.
While the app works, it gets stuck after the calls complete, which makes sense: it cannot exit the range c loop because the channel is never closed.
I am not sure where it is best to close the channel in this pattern.
package main

import (
	"fmt"
	"net/http"
)

func main() {
	links := []string{
		"https://github.com/fabpot",
		"https://github.com/andrew",
		"https://github.com/taylorotwell",
		"https://github.com/egoist",
		"https://github.com/HugoGiraudel",
	}
	checkUrls(links)
}

func checkUrls(urls []string) {
	c := make(chan string)
	for _, link := range urls {
		go checkUrl(link, c)
	}
	for msg := range c {
		fmt.Println(msg)
	}
	close(c) // this won't get hit
}

func checkUrl(url string, c chan string) {
	_, err := http.Get(url)
	if err != nil {
		c <- "We could not reach:" + url
	} else {
		c <- "Success reaching the website:" + url
	}
}

You close a channel when there are no more values to send, so in this case it's when all checkUrl goroutines have completed.
var wg sync.WaitGroup

func checkUrls(urls []string) {
	c := make(chan string)
	for _, link := range urls {
		wg.Add(1)
		go checkUrl(link, c)
	}
	go func() {
		wg.Wait()
		close(c)
	}()
	for msg := range c {
		fmt.Println(msg)
	}
}

func checkUrl(url string, c chan string) {
	defer wg.Done()
	_, err := http.Get(url)
	if err != nil {
		c <- "We could not reach:" + url
	} else {
		c <- "Success reaching the website:" + url
	}
}
(Note that the error from http.Get only reflects connection and protocol errors. It will not cover HTTP server errors such as 404 or 500, which you are presumably expecting too, seeing how you're checking full paths and not just hosts.)
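If you also want to treat HTTP error statuses as failures, a minimal sketch could check the status code as well; this is meant as a drop-in replacement for checkUrl above, and the >= 400 cutoff is my assumption, not something from the original answer:

func checkUrl(url string, c chan string) {
	defer wg.Done()
	resp, err := http.Get(url)
	if err != nil {
		c <- "We could not reach:" + url
		return
	}
	defer resp.Body.Close()
	// Assumption: treat any 4xx/5xx status as a failure.
	if resp.StatusCode >= 400 {
		c <- "Server error reaching:" + url + " (" + resp.Status + ")"
		return
	}
	c <- "Success reaching the website:" + url
}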

When writing Go programs that use channels and goroutines, always think about which function owns a channel. I prefer the practice of letting the function that owns a channel close it. If I were to write this, I would do it as shown below.
Note: an even better way to handle situations like this is the fan-out, fan-in concurrency pattern; see Go Concurrency Patterns: Pipelines and cancellation (https://blog.golang.org/pipelines). A sketch of the fan-in merge helper from that article follows the code below.
package main

import (
	"fmt"
	"net/http"
	"sync"
)

func main() {
	links := []string{
		"https://github.com/fabpot",
		"https://github.com/andrew",
		"https://github.com/taylorotwell",
		"https://github.com/egoist",
		"https://github.com/HugoGiraudel",
	}
	processURLS(links)
	fmt.Println("End of Main")
}

func processURLS(links []string) {
	resultsChan := checkUrls(links)
	for msg := range resultsChan {
		fmt.Println(msg)
	}
}

func checkUrls(urls []string) chan string {
	outChan := make(chan string)
	go func(urls []string) {
		defer close(outChan)
		var wg sync.WaitGroup
		for _, url := range urls {
			wg.Add(1)
			go checkUrl(&wg, url, outChan)
		}
		wg.Wait()
	}(urls)
	return outChan
}

func checkUrl(wg *sync.WaitGroup, url string, c chan string) {
	defer wg.Done()
	_, err := http.Get(url)
	if err != nil {
		c <- "We could not reach:" + url
	} else {
		c <- "Success reaching the website:" + url
	}
}
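As referenced above, here is a sketch of a fan-in merge helper in the spirit of the pipelines article: it multiplexes several result channels onto one output channel, and the goroutine that owns the output closes it once every input is drained.

func merge(chans ...<-chan string) <-chan string {
	out := make(chan string)
	var wg sync.WaitGroup
	wg.Add(len(chans))
	for _, c := range chans {
		go func(c <-chan string) {
			defer wg.Done()
			// Forward everything from one input channel to the shared output.
			for v := range c {
				out <- v
			}
		}(c)
	}
	go func() {
		// The owner of out closes it once all inputs are exhausted.
		wg.Wait()
		close(out)
	}()
	return out
}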

Related

Concurrency issues with crawler

I am trying to build a concurrent crawler based on the Tour and some other SO answers on the subject. What I have currently is below, but I think there are two subtle issues:
1. Sometimes I get 16 URLs in the response and sometimes 17 (see the debug print in main). I know this because when I change WriteToSlice to Read, 'Read: end, counter = ' in Read is sometimes never reached, and it's always when I get 16 URLs.
2. I have trouble with the err channel: I get no messages on it, even when I run my main Crawl method with an address like www.golang.org, i.e. without a valid schema, so an error should be sent via the err channel.
Concurrency is a really difficult topic; help and advice will be appreciated.
package main

import (
	"fmt"
	"net/http"
	"sync"

	"golang.org/x/net/html"
)

type urlCache struct {
	urls map[string]struct{}
	sync.Mutex
}

func (v *urlCache) Set(url string) bool {
	v.Lock()
	defer v.Unlock()
	_, exist := v.urls[url]
	v.urls[url] = struct{}{}
	return !exist
}

func newURLCache() *urlCache {
	return &urlCache{
		urls: make(map[string]struct{}),
	}
}

type results struct {
	data chan string
	err  chan error
}

func newResults() *results {
	return &results{
		data: make(chan string, 1),
		err:  make(chan error, 1),
	}
}

func (r *results) close() {
	close(r.data)
	close(r.err)
}

func (r *results) WriteToSlice(s *[]string) {
	for {
		select {
		case data := <-r.data:
			*s = append(*s, data)
		case err := <-r.err:
			fmt.Println("e ", err)
		}
	}
}

func (r *results) Read() {
	fmt.Println("Read: start")
	counter := 0
	for c := range r.data {
		fmt.Println(c)
		counter++
	}
	fmt.Println("Read: end, counter = ", counter)
}

func crawl(url string, depth int, wg *sync.WaitGroup, cache *urlCache, res *results) {
	defer wg.Done()
	if depth == 0 || !cache.Set(url) {
		return
	}
	response, err := http.Get(url)
	if err != nil {
		res.err <- err
		return
	}
	defer response.Body.Close()
	node, err := html.Parse(response.Body)
	if err != nil {
		res.err <- err
		return
	}
	urls := grablUrls(response, node)
	res.data <- url
	for _, url := range urls {
		wg.Add(1)
		go crawl(url, depth-1, wg, cache, res)
	}
}

func grablUrls(resp *http.Response, node *html.Node) []string {
	var f func(*html.Node) []string
	var results []string
	f = func(n *html.Node) []string {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key != "href" {
					continue
				}
				link, err := resp.Request.URL.Parse(a.Val)
				if err != nil {
					continue
				}
				results = append(results, link.String())
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
		return results
	}
	res := f(node)
	return res
}

// Crawl ...
func Crawl(url string, depth int) []string {
	wg := &sync.WaitGroup{}
	output := &[]string{}
	visited := newURLCache()
	results := newResults()
	defer results.close()
	wg.Add(1)
	go crawl(url, depth, wg, visited, results)
	go results.WriteToSlice(output)
	// go results.Read()
	wg.Wait()
	return *output
}

func main() {
	r := Crawl("https://www.golang.org", 2)
	// r := Crawl("www.golang.org", 2) // no schema; an error should be generated and sent via err
	fmt.Println(len(r))
}
Both your questions 1 and 2 are a result of the same bug.
In Crawl() you are not waiting for the go results.WriteToSlice(output) goroutine to finish. When the last crawl() returns, the wait group is released, and the output is returned and printed before WriteToSlice is done draining the data and err channels. So what happens is this:
1. crawl() finishes, placing data in results.data and results.err.
2. The wait group's Wait() unblocks, causing main() to print the length of the result []string.
3. Only then does WriteToSlice pull the last data (or err) item off the channel, too late to be included.
You need to return from Crawl() not only when the data is done being written to the channel, but also when the channel has been read in its entirety (including the buffer). A good way to do this is to close channels when you are sure you are done with them. By organizing the code this way, you can block on the goroutine that is draining the channels: instead of using the wait group to release main, you wait until the channels are 100% done.
You can see this in Go by Example: Closing Channels (https://gobyexample.com/closing-channels). Remember that when you close a channel, it can still be read until the last item is taken; you can close a buffered channel and the reader will still receive all the items that were queued in it.
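A tiny runnable illustration of that point (my example, not from the original answer):

package main

import "fmt"

func main() {
	ch := make(chan int, 2)
	ch <- 1
	ch <- 2
	close(ch) // the two buffered values are still delivered after close
	for v := range ch {
		fmt.Println(v) // prints 1, then 2, then the loop exits
	}
}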
There is some code structure that could be changed to make this cleaner, but here is a quick way to fix your program: change Crawl to block on WriteToSlice, close the data channel when the crawl finishes, and wait for WriteToSlice to finish.
// Crawl ...
func Crawl(url string, depth int) []string {
	wg := &sync.WaitGroup{}
	output := &[]string{}
	visited := newURLCache()
	results := newResults()
	go func() {
		wg.Add(1)
		go crawl(url, depth, wg, visited, results)
		wg.Wait()
		// All data is written; this makes WriteToSlice() unblock.
		close(results.data)
	}()
	// This will block until results.data is closed.
	results.WriteToSlice(output)
	close(results.err)
	return *output
}
Then, in WriteToSlice, you have to check for the closed channel to exit the for loop:
func (r *results) WriteToSlice(s *[]string) {
	for {
		select {
		case data, open := <-r.data:
			if !open {
				return // all data done
			}
			*s = append(*s, data)
		case err := <-r.err:
			fmt.Println("e ", err)
		}
	}
}
Here is the full code: https://play.golang.org/p/GBpGk-lzrhd (it won't work in the playground)

Defer never called when writing to channel

I'm trying to write to a channel as the last action in a goroutine.
Unfortunately this is not working, and the WaitGroup never finishes.
package main

import (
	"fmt"
	"sync"

	"github.com/SlyMarbo/rss"
)

func main() {
	urls := []string{"http://rss.cnn.com/rss/edition.rss", "http://rss.time.com/web/time/rss/top/index.xml"}
	var c = make(chan string)
	var wg sync.WaitGroup
	for _, url := range urls {
		wg.Add(1)
		go receiveRss(url, &wg, c)
	}
	wg.Wait()
	fmt.Println("==============DONE=================")
}

func receiveRss(url string, wg *sync.WaitGroup, c chan string) {
	defer wg.Done()
	feed, err := rss.Fetch(url)
	if err != nil {
		fmt.Println("Failed to retrieve RSS feed", err)
	}
	items := feed.Items
	for _, item := range items {
		c <- item.Title
	}
}
When I replace c <- item.Title with fmt.Println(item.Title), the deferred function is called and DONE is printed.
The problem is that I was only writing to the channel and never reading from it. Without a reader the channel is useless: every send blocks forever, so the deferred wg.Done() is never reached.
The solution is to read from the channel after the loop that starts the goroutines:
for title := range c {
	fmt.Println(title)
}
This in turn loops forever if the channel is never closed. So the channel has to be closed once all writers are done; since there are several writers here, the close cannot simply happen inside each goroutine (a second close, or a send on a closed channel, panics), so a helper goroutine waits on the WaitGroup and then calls close(c).
Here is the whole code:
func main() {
	urls := []string{"http://rss.cnn.com/rss/edition.rss", "http://rss.time.com/web/time/rss/top/index.xml"}
	var c = make(chan []string)
	var wg sync.WaitGroup
	for _, url := range urls {
		wg.Add(1)
		go receiveRss(url, &wg, c)
	}
	// Close the channel exactly once, after every writer is done,
	// so the range loop below can terminate.
	go func() {
		wg.Wait()
		close(c)
	}()
	for titles := range c {
		fmt.Println(titles)
	}
	fmt.Println("==============DONE=================")
}

func receiveRss(url string, wg *sync.WaitGroup, c chan []string) {
	defer wg.Done()
	feed, err := rss.Fetch(url)
	if err != nil {
		fmt.Println("Failed to retrieve RSS feed", err)
		return
	}
	var titles []string
	for _, item := range feed.Items {
		titles = append(titles, item.Title)
	}
	c <- titles
}

Golang app using sync.WaitGroup & channels never exits

I use sync.WaitGroup with defer wg.Done() and wg.Wait() to wait for my goroutines to complete.
The program does wait, but it never exits.
This is my program (runnable):
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"sync"
)

var symbols = []string{
	"ASSA-B.ST",
	"ELUX-B.ST",
	"HM-B.ST",
}

func main() {
	fmt.Println("fetching quotes...")
	fetchedSymbols := make(chan string)
	var wg sync.WaitGroup
	wg.Add(len(symbols))
	for _, symbol := range symbols {
		go fetchSymbol(symbol, &wg, fetchedSymbols)
	}
	for response := range fetchedSymbols {
		fmt.Println("fetched " + response)
	}
	wg.Wait()
	fmt.Println("done")
}

func fetchSymbol(symbol string, wg *sync.WaitGroup, c chan<- string) {
	defer wg.Done()
	resp, err := http.Get("http://ichart.yahoo.com/table.csv?s=" + symbol + "&a=0&b=1&c=2000")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	out, err := os.Create("./stock-quotes/" + symbol + ".csv")
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()
	io.Copy(out, resp.Body)
	c <- symbol
}
Shouldn't this program exit when all the quotes have been downloaded? (FYI: I just started learning Go.)
You're never closing the fetchedSymbols channel, so the range loop never exits.
One way to handle this is to use the WaitGroup you already have to signal when to close the channel. Ranging over fetchedSymbols is then enough to block progress in main, and you don't need another channel or WaitGroup.
...
go func() {
	wg.Wait()
	close(fetchedSymbols)
}()

for response := range fetchedSymbols {
	fmt.Println("fetched " + response)
}
...
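For completeness, here is main with that change applied (a sketch; fetchSymbol is unchanged, and the trailing wg.Wait() is dropped because the range loop can only end after the closer goroutine has already seen Wait return):

func main() {
	fmt.Println("fetching quotes...")
	fetchedSymbols := make(chan string)
	var wg sync.WaitGroup
	wg.Add(len(symbols))
	for _, symbol := range symbols {
		go fetchSymbol(symbol, &wg, fetchedSymbols)
	}
	// Close the channel once all fetches are done so the range loop ends.
	go func() {
		wg.Wait()
		close(fetchedSymbols)
	}()
	for response := range fetchedSymbols {
		fmt.Println("fetched " + response)
	}
	fmt.Println("done")
}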

Golang program hangs without finishing execution

I have the following Go program:
package main

import (
	"fmt"
	"net/http"
	"time"
)

var urls = []string{
	"http://www.google.com/",
	"http://golang.org/",
	"http://yahoo.com/",
}

type HttpResponse struct {
	url      string
	response *http.Response
	err      error
	status   string
}

func asyncHttpGets(url string, ch chan *HttpResponse) {
	client := http.Client{}
	if url == "http://www.google.com/" {
		time.Sleep(500 * time.Millisecond) // google is down
	}
	fmt.Printf("Fetching %s \n", url)
	resp, err := client.Get(url)
	u := &HttpResponse{url, resp, err, "fetched"}
	ch <- u
	fmt.Println("sent to chan")
}

func main() {
	fmt.Println("start")
	ch := make(chan *HttpResponse, len(urls))
	for _, url := range urls {
		go asyncHttpGets(url, ch)
	}
	for i := range ch {
		fmt.Println(i)
	}
	fmt.Println("Im done")
}
Run it on Playground
However, when I run it, it hangs; i.e. the last part that ought to print "Im done" doesn't run.
Here's the terminal output:
$ go run get.go
start
Fetching http://yahoo.com/
Fetching http://golang.org/
Fetching http://www.google.com/
sent to chan
&{http://www.google.com/ 0xc820144120 fetched}
sent to chan
&{http://golang.org/ 0xc82008b710 fetched}
sent to chan
&{http://yahoo.com/ 0xc82008b7a0 fetched}
The problem is that ranging over a channel in a for loop will continue forever unless the channel is closed. If you want to read precisely len(urls) values from the channel, you should loop that many times:
for i := 0; i < len(urls); i++ {
	fmt.Println(<-ch)
}
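In context, main would then look like this (a sketch; the rest of the program is unchanged):

func main() {
	fmt.Println("start")
	ch := make(chan *HttpResponse, len(urls))
	for _, url := range urls {
		go asyncHttpGets(url, ch)
	}
	// Receive exactly len(urls) responses, then fall through.
	for i := 0; i < len(urls); i++ {
		fmt.Println(<-ch)
	}
	fmt.Println("Im done")
}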
Another approach is to use a sync.WaitGroup: increment it per goroutine, then monitor it with Wait in a separate goroutine that closes your channel when all workers are done, allowing the next block of code to run. The reason I offer this approach is that it avoids a static count like len(urls) in the loop, so it works with a dynamic slice that might change.
The reason Wait and close are in their own goroutine is so that your code can reach the for loop that ranges over the channel.
package main

import (
	"fmt"
	"net/http"
	"sync"
	"time"
)

var urls = []string{
	"http://www.google.com/",
	"http://golang.org/",
	"http://yahoo.com/",
}

type HttpResponse struct {
	url      string
	response *http.Response
	err      error
	status   string
}

func asyncHttpGets(url string, ch chan *HttpResponse, wg *sync.WaitGroup) {
	client := http.Client{}
	if url == "http://www.google.com/" {
		time.Sleep(500 * time.Millisecond) // google is down
	}
	fmt.Printf("Fetching %s \n", url)
	resp, err := client.Get(url)
	u := &HttpResponse{url, resp, err, "fetched"}
	ch <- u
	fmt.Println("sent to chan")
	wg.Done()
}

func main() {
	fmt.Println("start")
	ch := make(chan *HttpResponse, len(urls))
	var wg sync.WaitGroup
	for _, url := range urls {
		wg.Add(1)
		go asyncHttpGets(url, ch, &wg)
	}
	go func() {
		wg.Wait()
		close(ch)
	}()
	for i := range ch {
		fmt.Println(i)
	}
	fmt.Println("Im done")
}

How do I handle errors in a worker pool using WaitGroup?

I have a problem using sync.WaitGroup and select together. If you take a look at the following HTTP request pool, you will notice that if an error occurs it will never be reported: nothing reads from the errors channel once Run has reached wg.Wait(), so the worker blocks on the send.
package pool

import (
	"fmt"
	"log"
	"net/http"
	"sync"
)

var (
	MaxPoolQueue  = 100
	MaxPoolWorker = 10
)

type Pool struct {
	wg     *sync.WaitGroup
	queue  chan *http.Request
	errors chan error
}

func NewPool() *Pool {
	return &Pool{
		wg:     &sync.WaitGroup{},
		queue:  make(chan *http.Request, MaxPoolQueue),
		errors: make(chan error),
	}
}

func (p *Pool) Add(r *http.Request) {
	p.wg.Add(1)
	p.queue <- r
}

func (p *Pool) Run() error {
	for i := 0; i < MaxPoolWorker; i++ {
		go p.doWork()
	}
	select {
	case err := <-p.errors:
		return err
	default:
		p.wg.Wait()
	}
	return nil
}

func (p *Pool) doWork() {
	for r := range p.queue {
		fmt.Printf("Request to %s\n", r.Host)
		p.wg.Done()
		_, err := http.DefaultClient.Do(r)
		if err != nil {
			log.Fatal(err)
			p.errors <- err
		} else {
			fmt.Printf("no error\n")
		}
	}
}
Source can be found here
How can I still use WaitGroup but also get errors from goroutines?
I found the answer myself while writing the question, and since I think it is an interesting case I would like to share it with you.
The trick to using sync.WaitGroup and a channel together is to wrap:
select {
case err := <-p.errors:
	return err
default:
	p.wg.Wait()
}
Together in a for loop:
for {
	select {
	case err := <-p.errors:
		return err
	default:
		p.wg.Wait()
	}
}
In this case select will always check for errors and wait if nothing happens :)
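One detail the snippet leaves open is how Run returns when no error ever occurs: once Wait unblocks, the wrapped select just keeps looping. A sketch of an alternative (my own, not part of the original answer) turns Wait into a done channel and selects on both:

func (p *Pool) Run() error {
	for i := 0; i < MaxPoolWorker; i++ {
		go p.doWork()
	}
	done := make(chan struct{})
	go func() {
		p.wg.Wait()
		close(done) // signal that every queued request has been handled
	}()
	select {
	case err := <-p.errors:
		return err
	case <-done:
		return nil
	}
}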
It looks a bit like the fail-fast mechanism enabled by the Tomb library (Tomb V2 GoDoc):
The tomb package handles clean goroutine tracking and termination.
If any of the tracked goroutines returns a non-nil error, or the Kill or Killf method is called by any goroutine in the system (tracked or not), the tomb Err is set, Alive is set to false, and the Dying channel is closed to flag that all tracked goroutines are supposed to willingly terminate as soon as possible.
Once all tracked goroutines terminate, the Dead channel is closed, and Wait unblocks and returns the first non-nil error presented to the tomb via a result or an explicit Kill or Killf method call, or nil if there were no errors.
You can see an example in this playground:
(extract)
// start runs all the given functions concurrently
// until either they all complete or one returns an
// error, in which case it returns that error.
//
// The functions are passed a channel which will be closed
// when the function should stop.
func start(funcs []func(stop <-chan struct{}) error) error {
	var tomb tomb.Tomb
	var wg sync.WaitGroup
	allDone := make(chan struct{})
	// Start all the functions.
	for _, f := range funcs {
		f := f
		wg.Add(1)
		go func() {
			defer wg.Done()
			if err := f(tomb.Dying()); err != nil {
				tomb.Kill(err)
			}
		}()
	}
	// Start a goroutine to wait for them all to finish.
	go func() {
		wg.Wait()
		close(allDone)
	}()
	// Wait for them all to finish, or one to fail.
	select {
	case <-allDone:
	case <-tomb.Dying():
	}
	tomb.Done()
	return tomb.Err()
}
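A hypothetical usage of start might look like this (the worker bodies are mine, purely for illustration, and assume the errors, fmt, and time imports):

func main() {
	err := start([]func(stop <-chan struct{}) error{
		func(stop <-chan struct{}) error {
			// A worker that finishes quickly and succeeds.
			return nil
		},
		func(stop <-chan struct{}) error {
			// A worker that stops early if another worker fails.
			select {
			case <-stop:
				return nil
			case <-time.After(time.Second):
				return errors.New("worker timed out") // illustrative failure
			}
		},
	})
	fmt.Println("start returned:", err)
}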
A simpler implementation is shown below (check it on the playground: https://play.golang.org/p/TYxxsDRt5Wu).
package main

import (
	"fmt"
	"sync"
	"time"
)

type Error struct {
	message string
}

func (e Error) Error() string {
	return e.message
}

func main() {
	var wg sync.WaitGroup
	waitGroupLength := 8
	errChannel := make(chan error, 1)

	// Set up the wait group to match the number of goroutines we'll launch.
	wg.Add(waitGroupLength)
	finished := make(chan bool, 1) // this, along with wg.Wait(), is why the error handling works and doesn't deadlock

	for i := 0; i < waitGroupLength; i++ {
		go func(i int) {
			fmt.Printf("Go routine %d executed\n", i+1)
			time.Sleep(time.Duration(waitGroupLength - i))
			time.Sleep(0) // only here so the time import is needed
			if i%4 == 1 {
				errChannel <- Error{fmt.Sprintf("Errored on routine %d", i+1)}
			}
			// Mark the wait group as done so it does not hang.
			wg.Done()
		}(i)
	}

	go func() {
		wg.Wait()
		close(finished)
	}()

L:
	for {
		select {
		case <-finished:
			break L // this will break from the loop
		case err := <-errChannel:
			if err != nil {
				fmt.Println("error ", err)
				// handle your error
			}
		}
	}

	fmt.Println("Executed all go routines")
}
