Goroutines hang on download of large files - Go

I am attempting to write an application that will download a range of images.
130 116kb images (works)
50 500kb images (works)
130 500kb images (eventually hangs)
230 116kb images (eventually hangs)
go version go1.9.2 darwin/amd64
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"sync"
)

func main() {
	var urls []string
	// var smallImage = "https://s3.amazonaws.com/golangplayground/116kb.jpeg" //116kb
	var largeImage = "https://s3.amazonaws.com/golangplayground/SampleJPGImage_500kbmb.jpg" //500kb
	for i := 0; i < 130; i++ {
		urls = append(urls, largeImage)
	}
	var wg sync.WaitGroup
	wg.Add(len(urls))
	var inc = 0
	for _, val := range urls {
		inc += 1
		go saveResourceFromURLToDisk(val, "./foo", &wg, inc)
	}
	wg.Wait()
	fmt.Println("done.")
}

func saveResourceFromURLToDisk(url string, writeTo string, wg *sync.WaitGroup, inc int) error {
	defer wg.Done()
	response, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
		return err
	}
	defer response.Body.Close()
	localPath := fmt.Sprintf("%s/%d", writeTo, inc)
	file, err := os.Create(localPath)
	if err != nil {
		log.Fatal(err)
		return err
	}
	defer file.Close()
	_, err = io.Copy(file, response.Body)
	if err != nil {
		log.Fatal(err)
		return err
	}
	fmt.Println(localPath)
	return nil
}

This is probably a network problem. There's a reason web browsers limit how many connections they open to the same server.
If you open a bunch of TCP sessions all at the same time, almost all of them will lose packets. Then they'll all retry at about the same time, losing more packets. It's just a big pile of loss.
Place a small delay between opening each GET request, or limit yourself to 4-8 simultaneous downloads from the same server.
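One sketch of that suggestion, dropped into main in place of the launch loop from the question: a small worker pool where a fixed number of goroutines pull work off a channel, so no more than that many downloads are in flight at once. It reuses saveResourceFromURLToDisk unchanged; the limit of 8 workers is just an illustrative number.
const maxWorkers = 8

jobs := make(chan int) // indexes into urls

var wg sync.WaitGroup
wg.Add(len(urls)) // saveResourceFromURLToDisk calls wg.Done() itself
for w := 0; w < maxWorkers; w++ {
	go func() {
		// each worker downloads one URL at a time until jobs is closed
		for i := range jobs {
			saveResourceFromURLToDisk(urls[i], "./foo", &wg, i)
		}
	}()
}
for i := range urls {
	jobs <- i
}
close(jobs)
wg.Wait()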

I found the answer with Zan's help by bucketing my goroutines into groups of 5 requests each. This way I get to take advantage of some parallelism while throttling the number of open connections I am creating.
It's a bit naive, and I am wondering if anyone has a more elegant solution.
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"sync"
)

func main() {
	var urls []string
	// var smallImage = "https://s3.amazonaws.com/golangplayground/116kb.jpeg" //116kb
	var largeImage = "https://s3.amazonaws.com/golangplayground/SampleJPGImage_500kbmb.jpg" //500kb
	for i := 0; i < 150; i++ {
		urls = append(urls, largeImage)
	}
	var inc = 0
	for x := 0; x < len(urls)/5; x++ {
		var wg sync.WaitGroup
		for y := 0; y < 5; y++ {
			wg.Add(1)
			go saveResourceFromURLToDisk(urls[x*5+y], "./foo", &wg, inc) // index the current bucket, not urls[x*y]
			inc += 1
		}
		wg.Wait()
	}
	fmt.Println("done.")
}

func saveResourceFromURLToDisk(url string, writeTo string, wg *sync.WaitGroup, inc int) error {
	defer wg.Done()
	response, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
		return err
	}
	defer response.Body.Close()
	localPath := fmt.Sprintf("%s/%d", writeTo, inc)
	file, err := os.Create(localPath)
	if err != nil {
		log.Fatal(err)
		return err
	}
	defer file.Close()
	_, err = io.Copy(file, response.Body)
	if err != nil {
		log.Fatal(err)
		return err
	}
	fmt.Println(localPath)
	return nil
}
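A possibly more elegant alternative to fixed buckets of 5 is a buffered channel used as a semaphore: every URL still gets its own goroutine, but only a limited number may download at once. A sketch, again reusing saveResourceFromURLToDisk from above (the limit of 8 is illustrative).
sem := make(chan struct{}, 8) // at most 8 downloads in flight

var wg sync.WaitGroup
wg.Add(len(urls)) // saveResourceFromURLToDisk calls wg.Done() itself
for i, u := range urls {
	go func(u string, i int) {
		sem <- struct{}{}        // acquire a slot
		defer func() { <-sem }() // release it when the download finishes
		saveResourceFromURLToDisk(u, "./foo", &wg, i)
	}(u, i)
}
wg.Wait()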

Related

Go: negative WaitGroup counter

I'm somewhat new to Go and am reworking code that I found somewhere else to fit my needs. Because of that, I don't totally understand what is happening here, although I get the general idea.
I'm running a few websocket clients using goroutines, but I'm getting an unexpected error that causes the program to crash. My program seems to close one too many threads (excuse me if this is the wrong terminology) when there is an error reading a message from the websocket (see the conn.ReadMessage() call in the readHandler func). Any ideas on how I would work around this issue? I would really appreciate anyone taking the time to look through it. Thanks in advance!
package main

import (
	"context"
	"fmt"
	"os"
	"time"
	"os/signal"
	"syscall"
	"sync"
	"net/url"
	"github.com/gorilla/websocket"
	"strconv"
	"encoding/json"
	"log"
	"bytes"
	"compress/gzip"
	"io/ioutil"
)

// Structs
type Ping struct {
	Ping int64 `json:"ping"`
}

type Pong struct {
	Pong int64 `json:"pong"`
}

type SubParams struct {
	Sub string `json:"sub"`
	ID  string `json:"id"`
}

func InitSub(subType string, pair string, i int) []byte {
	var idInt string = "id" + strconv.Itoa(i)
	subStr := "market." + pair + "." + subType
	sub := &SubParams{
		Sub: subStr,
		ID:  idInt,
	}
	out, err := json.MarshalIndent(sub, "", " ")
	if err != nil {
		log.Println(err)
	}
	//log.Println(string(out))
	return out
}

// main func
func main() {
	var server string = "api.huobi.pro"
	pairs := []string{"btcusdt", "ethusdt", "ltcusdt"}
	comms := make(chan os.Signal, 1)
	signal.Notify(comms, os.Interrupt, syscall.SIGTERM)
	ctx := context.Background()
	ctx, cancel := context.WithCancel(ctx)
	var wg sync.WaitGroup
	for x, pair := range pairs {
		wg.Add(1)
		go control(server, "ws", pair, ctx, &wg, x+1)
	}
	<-comms
	cancel()
	wg.Wait()
}

func control(server string, path string, pair string, ctx context.Context, wg *sync.WaitGroup, i int) {
	fmt.Printf("Started control for %s\n", server)
	url := url.URL{
		Scheme: "wss",
		Host:   server,
		Path:   path,
	}
	fmt.Println(url.String())
	conn, _, err := websocket.DefaultDialer.Dial(url.String(), nil)
	if err != nil {
		panic(err)
	}
	subscribe(conn, pair, i)
	defer conn.Close()
	var localwg sync.WaitGroup
	localwg.Add(1)
	go readHandler(ctx, conn, &localwg, server)
	<-ctx.Done()
	localwg.Wait()
	wg.Done()
	return
}

func readHandler(ctx context.Context, conn *websocket.Conn, wg *sync.WaitGroup, server string) {
	for {
		select {
		case <-ctx.Done():
			wg.Done()
			return
		default:
			_, p, err := conn.ReadMessage()
			if err != nil {
				wg.Done()
				fmt.Println(err)
			}
			r, err := gzip.NewReader(bytes.NewReader(p))
			if err == nil {
				result, err := ioutil.ReadAll(r)
				if err != nil {
					fmt.Println(err)
				}
				d := string(result)
				fmt.Println(d)
				var ping Ping
				json.Unmarshal([]byte(d), &ping)
				if ping.Ping > 0 {
					str := Pong{Pong: ping.Ping}
					msg, err := json.Marshal(str)
					if err == nil {
						fmt.Println(string(msg))
						conn.WriteMessage(websocket.TextMessage, []byte(msg))
					}
				}
			}
		}
	}
}

func subscribe(conn *websocket.Conn, pair string, id int) {
	sub := string(InitSub("trade.detail", pair, id))
	err := conn.WriteMessage(websocket.TextMessage, []byte(sub))
	if err != nil {
		panic(err)
	}
}
Break out of the readHandler loop when the connection fails:
_, p, err := conn.ReadMessage()
if err != nil {
	wg.Done()
	fmt.Println(err)
	return // <--- add this line
}
Without the return, the function spins in a tight loop reading errors until the panic.
Use defer wg.Done() at the beginning of the goroutine to ensure that Done is called exactly once.
func readHandler(ctx context.Context, conn *websocket.Conn, wg *sync.WaitGroup, server string) {
	defer wg.Done()
	for {
		select {
		case <-ctx.Done():
			return
		default:
			_, p, err := conn.ReadMessage()
			if err != nil {
				fmt.Println(err)
				return
			}
			...
Update the control function as well.
Because the caller does not execute any code concurrently with readHandler, there's no value in running readHandler in a goroutine. Remove all references to wait groups from readHandler and call the function directly: change go readHandler(ctx, conn, &localwg, server) to readHandler(ctx, conn, server).
There are more issues, but this should move you further along.
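Putting those pieces together, control might end up looking something like this sketch (readHandler here is the reworked version above, with the WaitGroup parameter removed; the local variable u avoids shadowing the url package):
func control(server string, path string, pair string, ctx context.Context, wg *sync.WaitGroup, i int) {
	defer wg.Done()
	fmt.Printf("Started control for %s\n", server)
	u := url.URL{
		Scheme: "wss",
		Host:   server,
		Path:   path,
	}
	fmt.Println(u.String())
	conn, _, err := websocket.DefaultDialer.Dial(u.String(), nil)
	if err != nil {
		panic(err)
	}
	defer conn.Close()
	subscribe(conn, pair, i)
	// Called directly: no local WaitGroup and no extra goroutine needed.
	// It returns on a read error; main's cancel() plus closing the
	// connection is what ultimately unblocks it on shutdown.
	readHandler(ctx, conn, server)
}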

Go is not writing complete data to text file

I am trying to explore Go concurrency. Here Grabber() prints and writes the result of the execution. The program prints the expected result, but does not write it to urls.txt. Can anyone explain what I am missing here?
main.go
package main
import (
"bufio"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"strings"
"sync"
)
var wg sync.WaitGroup
var mt sync.Mutex
// Final Literation
func main() {
file, err := os.Open("ip.txt")
if err != nil {
log.Fatal(err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
go Grabber(scanner.Text())
wg.Add(1)
}
wg.Wait()
if err := scanner.Err(); err != nil {
log.Fatal(err)
}
}
// stringInArray do If string in list return true false otherwise.
func stringInArray(a string, list []string) bool {
for _, b := range list {
if b == a {
return true
}
}
return false
}
// Grabber Do Search the bing and collect array of sitelist
func Grabber(ip string) {
defer wg.Done()
var output []string
outfile, err := os.Create("urls.txt")
if err != nil {
log.Fatal(err)
}
defer outfile.Close()
if ip == "" {
}
page := 1
for page < 251 {
client := &http.Client{}
req, err := http.NewRequest(
http.MethodGet,
fmt.Sprintf(
"http://www.bing.com/search?q=ip:%s+&count=50&first=1",
ip,
),
nil,
)
if err != nil {
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:57.0) Gecko/20100101 Firefox/57.0")
res, err := client.Do(req)
if err != nil {
fmt.Println("Invalid Request")
}
defer res.Body.Close()
body, err := ioutil.ReadAll(res.Body)
if err != nil {
fmt.Println("Couldn't Read")
}
re := regexp.MustCompile(`<h2><a href="(.*?)"`)
links := re.FindAllString(string(body), -1)
if links != nil {
for l := range links {
o := strings.Split(links[l], `"`)
d := strings.Split(o[1], "/")
s := d[0] + "//" + d[2]
if !stringInArray(s, output) {
output = append(output, s)
}
}
}
page = page + 50
}
for _, links := range output {
fmt.Println(links)
fmt.Fprintln(outfile, links)
}
}
Ip.txt as input
103.253.145.129
103.253.146.125
103.253.146.239
103.253.147.72
146.185.176.79
146.185.176.45
146.185.179.250
146.185.180.35
146.185.180.185
146.185.180.113
146.185.181.51
146.185.183.107
146.185.183.202
146.185.183.248
146.185.183.219
146.185.184.69
146.185.185.169
git repo URLGrabber
You are calling os.Create in each goroutine, which truncates the file each time. Instead, create the file once outside, and serialize the writes to it using another goroutine:
outfile, err := os.Create("urls.txt")

results := make(chan []string)

go func() {
	for output := range results {
		for _, links := range output {
			fmt.Fprintln(outfile, links)
		}
	}
}()

scanner := bufio.NewScanner(file)
for scanner.Scan() {
	go Grabber(scanner.Text(), results)
	wg.Add(1)
}
wg.Wait()
close(results)
When you get the results in Grabber, instead of writing them to the file, send them to the channel:
results <- output
for _, links := range output {
	fmt.Println(links)
}
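For completeness, a sketch of how the end of Grabber might look under this approach, matching the go Grabber(scanner.Text(), results) call above. The scraping loop is unchanged from the question and is only indicated by a comment here; the os.Create/outfile code is gone entirely.
// Grabber searches Bing for the ip and sends the collected URLs on results.
func Grabber(ip string, results chan<- []string) {
	defer wg.Done()
	var output []string

	// ... build output exactly as before (Bing request, regexp, stringInArray) ...

	results <- output // hand the URLs to the single writer goroutine
	for _, links := range output {
		fmt.Println(links)
	}
}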

Why does this Golang app use more memory the longer it runs?

I made this to monitor a few websites and notify me if one of them goes down. I'm testing it on just two URLs. When it starts it uses about 5 MB of memory (I checked with systemctl status monitor). After 40 minutes it's using 7.4 MB, and after 8 hours it uses over 50 MB. Why is it doing this? Is this called a memory leak?
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"sync"
	"time"
	"monitor/utils/slack"
	"gopkg.in/yaml.v2"
)

var config struct {
	Frequency int
	Urls      []string
}

type statusType struct {
	values map[string]int
	mux    sync.Mutex
}

var status = statusType{values: make(map[string]int)}

func (s *statusType) set(url string, value int) {
	s.mux.Lock()
	s.values[url] = value
	s.mux.Unlock()
}

func init() {
	data, err := ioutil.ReadFile("config.yaml")
	if err != nil {
		fmt.Printf("Invalid config: %s\n", err)
		os.Exit(0)
	}
	err = yaml.Unmarshal(data, &config)
	if err != nil {
		fmt.Printf("Invalid config: %s\n", err)
		os.Exit(0)
	}
	for _, url := range config.Urls {
		status.set(url, 200)
	}
}

func main() {
	ticker := time.NewTicker(time.Duration(config.Frequency) * time.Second)
	for _ = range ticker.C {
		for _, url := range config.Urls {
			go check(url)
		}
	}
}

func check(url string) {
	res, err := http.Get(url)
	if err != nil {
		res = &http.Response{StatusCode: 500}
	}
	// the memory problem occurs when this condition is never satisfied, so I didn't post the slack package.
	if res.StatusCode != status.values[url] {
		status.set(url, res.StatusCode)
		err := slack.Alert(url, res.StatusCode)
		if err != nil {
			fmt.Println(err)
		}
	}
}
If this belongs in Code Review then I will put it there.
Yes, this is a memory leak. One obvious source I can spot is that you're not closing the response bodies from your requests:
func check(url string) {
	res, err := http.Get(url)
	if err != nil {
		res = &http.Response{StatusCode: 500}
	} else {
		defer res.Body.Close() // You need to close the response body!
	}
	if res.StatusCode != status.values[url] {
		status.set(url, res.StatusCode)
		err := slack.Alert(url, res.StatusCode)
		if err != nil {
			fmt.Println(err)
		}
	}
}
Better still, so that Go can use keepalive, you want to read the full body and close it:
defer func() {
	io.Copy(ioutil.Discard, res.Body)
	res.Body.Close()
}()
You can further analyse where memory usage is coming from by profiling your application with pprof. There's a good rundown on the Go blog and a web search will turn up many more articles on the topic.
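If you want to go down the profiling route, a minimal way to expose pprof from a long-running process like this is the standard net/http/pprof blank import plus a debug HTTP listener; the localhost:6060 address is just an illustrative choice.
import (
	"log"
	"net/http"
	_ "net/http/pprof" // registers the /debug/pprof handlers on the default mux
)

func init() {
	go func() {
		// Visit http://localhost:6060/debug/pprof/heap, or point
		// `go tool pprof` at it, to inspect live memory usage.
		log.Println(http.ListenAndServe("localhost:6060", nil))
	}()
}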

How to measure byte size before and after compression, and the time to compress

I want to gzip a string (it is actually a JSON response)
var b bytes.Buffer
gz := gzip.NewWriter(&b)
if _, err := gz.Write([]byte("YourDataHere")); err != nil {
	panic(err)
}
How can I easily output the size in bytes before and after compression, and more importantly, how can I time how long it takes to compress and then decompress back to a string?
You can calculate the size as per Nipun Talukdar's comment.
len([]byte("YourDataHere"))
b.Len()
And use time.Now() and time.Since() to get the time taken.
var b bytes.Buffer
input := []byte("YourDataHere")
fmt.Println("Input size : ", len(input))
gz := gzip.NewWriter(&b)
start := time.Now()
if _, err := gz.Write(input); err != nil {
	panic(err)
}
if err := gz.Flush(); err != nil { // Flush returns only an error
	panic(err)
}
totalTime := time.Since(start)
fmt.Println("Compressed size : ", b.Len(), "\nTime taken : ", totalTime)
gz.Close() // Close writes the gzip footer, so the final size is slightly larger
The same method can be applied to unzipping.
You can also create a small helper function to do the timing.
func timer(startTime time.Time) {
	totalTime := time.Since(startTime)
	log.Println("Time taken : ", totalTime)
}
Usage: defer timer(time.Now()) at the start of the function you want to time.
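A minimal sketch of the decompression side, following the same pattern. It assumes b still holds the compressed data from the snippet above and that gz.Close() has already been called (the reader needs the complete stream), plus the compress/gzip, io/ioutil, time, and fmt imports.
start := time.Now()
r, err := gzip.NewReader(&b) // b must contain a complete gzip stream
if err != nil {
	panic(err)
}
decompressed, err := ioutil.ReadAll(r)
if err != nil {
	panic(err)
}
r.Close()
fmt.Println("Decompressed size : ", len(decompressed), "\nTime taken : ", time.Since(start))
fmt.Println("Back to a string : ", string(decompressed))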
There are examples of how to do this in the Go benchmark suite: https://golang.org/test/bench/go1/gzip_test.go
Thankfully it's BSD licensed...
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This benchmark tests gzip and gunzip performance.

package go1

import (
	"bytes"
	gz "compress/gzip"
	"io"
	"io/ioutil"
	"testing"
)

var (
	jsongunz = bytes.Repeat(jsonbytes, 10) // jsonbytes is defined elsewhere in the go1 benchmark package
	jsongz   []byte
)

func init() {
	var buf bytes.Buffer
	c := gz.NewWriter(&buf)
	c.Write(jsongunz)
	c.Close()
	jsongz = buf.Bytes()
}

func gzip() {
	c := gz.NewWriter(ioutil.Discard)
	if _, err := c.Write(jsongunz); err != nil {
		panic(err)
	}
	if err := c.Close(); err != nil {
		panic(err)
	}
}

func gunzip() {
	r, err := gz.NewReader(bytes.NewBuffer(jsongz))
	if err != nil {
		panic(err)
	}
	if _, err := io.Copy(ioutil.Discard, r); err != nil {
		panic(err)
	}
	r.Close()
}

func BenchmarkGzip(b *testing.B) {
	b.SetBytes(int64(len(jsongunz)))
	for i := 0; i < b.N; i++ {
		gzip()
	}
}

func BenchmarkGunzip(b *testing.B) {
	b.SetBytes(int64(len(jsongunz)))
	for i := 0; i < b.N; i++ {
		gunzip()
	}
}
It's unfortunate that gzip.Writer can't report the number of compressed bytes written to the underlying stream. It gets more complicated when that underlying stream is not in-memory.
To solve this, I wrote a "counting io.Writer" that I place in between gzip.Writer and the underlying stream, so I can count and extract the number of compressed bytes written.
Try out the following code in the Go Playground.
package main

import (
	"compress/gzip"
	"fmt"
	"io"
	"os"
)

// countingWriter is an io.Writer that counts the total bytes written to it.
type countingWriter struct {
	w     io.Writer
	Count int
}

var _ io.Writer = &countingWriter{}

func newCountingWriter(w io.Writer) *countingWriter {
	return &countingWriter{w: w}
}

func (cw *countingWriter) Write(p []byte) (int, error) {
	n, err := cw.w.Write(p)
	cw.Count += n
	return n, err
}

func ExampleUse(w io.Writer) (int, error) {
	cw := newCountingWriter(w)
	zw, err := gzip.NewWriterLevel(cw, gzip.BestCompression)
	if err != nil {
		return 0, err
	}
	if _, err := zw.Write([]byte("hello world")); err != nil {
		return cw.Count, err
	}
	err = zw.Close()
	return cw.Count, err
}

func main() {
	n, err := ExampleUse(os.Stderr)
	if err != nil {
		panic(err)
	}
	fmt.Printf("wrote %d bytes\n", n)
}

Any better way to keep track of goroutine responses?

I'm trying to get my head around goroutines. I've created a simple program that performs the same search in parallel across multiple search engines. At the moment, to keep track of the number of responses, I count the ones I've received. It seems a bit amateur though.
Is there a better way of knowing when I've received a response from all of the goroutines in the following code?
package main

import (
	"fmt"
	"net/http"
	"log"
)

type Query struct {
	url    string
	status string
}

func search(url string, out chan Query) {
	fmt.Printf("Fetching URL %s\n", url)
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	out <- Query{url, resp.Status}
}

func main() {
	searchTerm := "carrot"
	fmt.Println("Hello world! Searching for ", searchTerm)
	searchEngines := []string{
		"http://www.bing.co.uk/?q=",
		"http://www.google.co.uk/?q=",
		"http://www.yahoo.co.uk/?q="}
	out := make(chan Query)
	for i := 0; i < len(searchEngines); i++ {
		go search(searchEngines[i]+searchTerm, out)
	}
	progress := 0
	for {
		// is there a better way of doing this step?
		if progress >= len(searchEngines) {
			break
		}
		fmt.Println("Polling...")
		query := <-out
		fmt.Printf("Status from %s was %s\n", query.url, query.status)
		progress++
	}
}
Please use sync.WaitGroup; there is an example in the package documentation.
searchEngines := []string{
	"http://www.bing.co.uk/?q=",
	"http://www.google.co.uk/?q=",
	"http://www.yahoo.co.uk/?q="}

var wg sync.WaitGroup
// Buffer the channel so each goroutine can send its result and exit
// even though nothing receives until after wg.Wait().
out := make(chan Query, len(searchEngines))

for i := 0; i < len(searchEngines); i++ {
	wg.Add(1)
	go func(url string) {
		defer wg.Done()
		fmt.Printf("Fetching URL %s\n", url)
		resp, err := http.Get(url)
		if err != nil {
			log.Fatal(err)
		}
		defer resp.Body.Close()
		out <- Query{url, resp.Status}
	}(searchEngines[i] + searchTerm)
}
wg.Wait()
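Once Wait returns, every goroutine has sent its result, so you can close the channel and range over the buffered values to collect the responses, for example:
close(out)
for query := range out {
	fmt.Printf("Status from %s was %s\n", query.url, query.status)
}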
