func grabPage(i int, wg *sync.WaitGroup, buf *[]byte) {
defer wg.Done()
res, err := http.Get("https://en.wikipedia.org/wiki/Immanuel_Kant")
if err != nil {
log.Fatal(err)
}
f, err := os.Create(fmt.Sprintf("./data/%d.txt", i))
if err != nil {
log.Fatal(err)
}
_, err = io.CopyBuffer(f, res.Body, *buf)
if err != nil {
log.Fatal(err)
}
}
func main() {
f, _ := os.Create("cpuprofile")
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
runtime.GOMAXPROCS(4)
start := time.Now()
var wg sync.WaitGroup
total := 800
var buf []byte
wg.Add(total)
for index := 0; index < total; index++ {
go grabPage(index, &wg, &buf)
}
wg.Wait()
elapsed := time.Since(start)
log.Printf("took %s", elapsed)
}
I have a toy program that simply makes an HTTP request and writes the response to a file. wget shows a latency of 0.2s and I am wondering how to get closer to that sort of speed (if possible). I have tried io.Copy, fasthttp, bufio stream readers, etc., all of which were slower than the code above. I just want to make sure there is nothing obvious I am missing that could speed this up, as I am very new to Go. I liked fasthttp's concept of reading directly into a buffer, but I think I mimicked that via io.CopyBuffer. Here are my pprof results:
0 0% 0% 6.78s 68.00% io.CopyBuffer /usr/local/opt/go/libexec/src/io/io.go
0 0% 0% 6.78s 68.00% io.copyBuffer /usr/local/opt/go/libexec/src/io/io.go
0 0% 0% 5.44s 54.56% net/http.(*http2gzipReader).Read /usr/local/opt/go/libexec/src/net/http/h2_bundle.go
0 0% 0% 5.41s 54.26% compress/gzip.(*Reader).Read /usr/local/opt/go/libexec/src/compress/gzip/gunzip.go
0 0% 0% 5.35s 53.66% compress/flate.(*decompressor).Read /usr/local/opt/go/libexec/src/compress/flate/inflate.go
0.92s 9.23% 9.23% 5.20s 52.16% compress/flate.(*decompressor).huffmanBlock /usr/local/opt/go/libexec/src/compress/flate/inflate.go
2.97s 29.79% 39.02% 2.99s 29.99% syscall.Syscall /usr/local/opt/go/libexec/src/syscall/asm_darwin_amd64.s
0.01s 0.1% 39.12% 2.59s 25.98% internal/poll.(*FD).Write /usr/local/opt/go/libexec/src/internal/poll/fd_unix.go
1.02s 10.23% 49.35% 2.53s 25.38% compress/flate.(*decompressor).huffSym /usr/local/opt/go/libexec/src/compress/flate/inflate.go
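The profile above says most of the CPU goes into gzip inflation and write syscalls. A minimal sketch of one variant worth trying (an experiment under those assumptions, not a guaranteed win): give every goroutine its own reusable copy buffer instead of the shared *[]byte (io.CopyBuffer allocates internally when handed a nil buffer), close the response body and the file, and turn off transparent gzip with http.Transport's DisableCompression field, which trades decompression CPU for more bytes on the wire.
// Sketch only: same URL and file layout as above, but with a shared Client whose
// Transport does not request gzip, so nothing has to be inflated before writing.
var client = &http.Client{
    Transport: &http.Transport{
        DisableCompression: true, // no Accept-Encoding: gzip, so the body arrives uncompressed
    },
}

func grabPage(i int, wg *sync.WaitGroup) {
    defer wg.Done()
    buf := make([]byte, 32*1024) // per-goroutine buffer actually reused by io.CopyBuffer

    res, err := client.Get("https://en.wikipedia.org/wiki/Immanuel_Kant")
    if err != nil {
        log.Fatal(err)
    }
    defer res.Body.Close()

    f, err := os.Create(fmt.Sprintf("./data/%d.txt", i))
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    if _, err := io.CopyBuffer(f, res.Body, buf); err != nil {
        log.Fatal(err)
    }
}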
Related
I am processing a huge data file, approximately 100 GB. Each line in that file is a piece of JSON data which I'd like to read, compress, and store in an in-memory database.
var wg sync.WaitGroup
for {
line, err := reader.ReadString('\n')
if err != nil {
break
}
go func(index int) {
wg.Add(1)
pr, pw := io.Pipe()
zw := lzw.NewWriter(pw, lzw.LSB, 8)
_, err := io.Copy(zw, strings.NewReader(line))
pw.Close()
zw.Close()
if err != nil {
fmt.Println(err.Error())
}
b, err := io.ReadAll(pr)
if err != nil {
fmt.Println(err.Error())
}
client.Set(ctx, fmt.Sprintf("%d", index), base64.StdEncoding.EncodeToString(b), time.Hour*1000)
pr.Close()
wg.Done()
}(index)
if index%10000 == 0 {
fmt.Println(index)
wg.Wait()
}
index += 1
}
However, this code stops after processing the first 10000 lines. When I move the wg.Add(1) down after the zw.Close(), it keeps processing the rest of the lines (but becomes unstable). Without the lzw and the io.Pipe(), when I store the exact values uncompressed, everything works without any issue.
I am not sure whether I am using the WaitGroup incorrectly, or whether there is something about io.Pipe() that I am not aware of yet.
TLDR:
1- Removing pr, pw := io.Pipe() makes the code simpler, since it is superfluous;
try this:
line, err := reader.ReadString('\n')
if err == io.EOF {
wg.Wait()
break
}
if err != nil {
log.Fatal(err)
}
wg.Add(1)
go func(index int) {
var buf bytes.Buffer
{ // lexical scoping (static scoping)
zw := lzw.NewWriter(&buf, lzw.LSB, 8)
n, err := zw.Write([]byte(line)) // n, err := io.Copy(zw, strings.NewReader(line))
if err != nil {
log.Fatal(err)
}
if int(n) != len(line) {
log.Fatal(n, len(line))
}
// It is the caller's responsibility to call Close on the WriteCloser when finished writing.
if err = zw.Close(); err != nil {
log.Fatal(err)
}
}
ctx, cancelFunc := context.WithTimeout(context.Background(), 100*time.Millisecond)
client.Set(ctx, fmt.Sprintf("%d", index), base64.StdEncoding.EncodeToString(buf.Bytes()), 1000*time.Hour)
cancelFunc()
wg.Done()
}(index)
if index%tenThousand == 0 {
wg.Wait()
}
2- You need to put the wg.Add(1) before go func(index int) {, otherwise wg.Wait() may run before the goroutine has had a chance to call wg.Add(1):
wg.Add(1)
go func(index int) {
3- The wg.Wait() logic:
if index%10000 == 0 {
fmt.Println(index)
wg.Wait()
}
What happens on the last iteration if index%10000 != 0? The goroutines started since the last multiple of 10000 are never waited for.
So when err == io.EOF you need to wg.Wait() for all goroutines to join:
if err == io.EOF {
wg.Wait()
fmt.Println("\n**** All done **** index =", index)
break
}
4- You may use lexical scoping (static scoping) to limit the scope of some variables and make the code more manageable, and to make clear where to Close the lzw writer:
{ // lexical scoping (static scoping)
zw := lzw.NewWriter(&buf, lzw.LSB, 8) // write directly to buf; a bufio.Writer here would lose data unless it were explicitly flushed
n, err := io.Copy(zw, strings.NewReader(line))
if err != nil {
log.Fatal(err)
}
if int(n) != len(line) {
log.Fatal(n, len(line))
}
// It is the caller's responsibility to call Close on the WriteCloser when finished writing.
if err = zw.Close(); err != nil {
log.Fatal(err)
}
}
5- Always check the errors, e.g.:
if err = zw.Close(); err != nil {
log.Fatal(err)
}
This is a working version close to your code. Try it just to experiment with the concurrency logic and see what happens (not recommended, since it still has the superfluous goroutines and io.Pipe; it is just working):
package main
import (
"bufio"
"compress/lzw"
"context"
"encoding/base64"
"fmt"
"io"
"log"
"strings"
"sync"
"time"
)
func main() {
index := 0
client := &myClient{}
reader := bufio.NewReader(file)
// your code:
var wg sync.WaitGroup
for {
index++
line, err := reader.ReadString('\n')
if err != nil {
msg <- fmt.Sprint(index, " Done not waiting with err: ", err, time.Now())
wg.Wait() // break waiting // if index%tenThousand != 0
break
}
wg.Add(1)
go func(i int) {
msg <- fmt.Sprint(i, " Enter running ... ", time.Now())
asyncReader, asyncWriter := io.Pipe() // make it async to read and write
zipWriter := lzw.NewWriter(asyncWriter, lzw.LSB, 8)
go func() { // async
_, err := io.Copy(zipWriter, strings.NewReader(line))
if err != nil {
log.Fatal(err)
}
_ = zipWriter.Close()
_ = asyncWriter.Close() // for io.ReadAll
}()
b, err := io.ReadAll(asyncReader)
if err != nil {
log.Fatal(err)
}
client.Set(context.Background(), fmt.Sprintf("%d", i), base64.StdEncoding.EncodeToString(b), time.Hour*1000)
asyncReader.Close()
time.Sleep(1 * time.Second)
msg <- fmt.Sprint(i, " Exit running ... ", time.Now())
wg.Done()
}(index)
msg <- fmt.Sprint(index, " ", index%tenThousand == 0, " after go call")
if index%tenThousand == 0 {
wg.Wait()
msg <- fmt.Sprint("..", index, " Done waiting after go call. ", time.Now())
}
}
msg <- "Bye forever."
wg.Wait()
close(msg)
wgMsg.Wait()
}
// just for the Go Playground:
const tenThousand = 2
type myClient struct {
}
func (p *myClient) Set(ctx context.Context, a, b string, t time.Duration) {
// fmt.Println("a =", a, ", b =", b, ", t =", t)
if ctx.Err() != nil {
fmt.Println(ctx.Err())
}
}
var file, myw = io.Pipe()
func init() {
go func() {
for i := 1; i <= tenThousand+1; i++ {
fmt.Fprintf(myw, "%d text to compress aaaaaaaaaaaaaa\n", i)
}
myw.Close()
}()
wgMsg.Add(1)
go func() {
defer wgMsg.Done()
for s := range msg {
fmt.Println(s)
}
}()
}
var msg = make(chan string, 100)
var wgMsg sync.WaitGroup
Output:
1 false after go call
2 true after go call
1 Enter running ... 2009-11-10 23:00:00 +0000 UTC m=+0.000000001
2 Enter running ... 2009-11-10 23:00:00 +0000 UTC m=+0.000000001
1 Exit running ... 2009-11-10 23:00:01 +0000 UTC m=+1.000000001
2 Exit running ... 2009-11-10 23:00:01 +0000 UTC m=+1.000000001
..2 Done waiting after go call. 2009-11-10 23:00:01 +0000 UTC m=+1.000000001
3 false after go call
3 Enter running ... 2009-11-10 23:00:01 +0000 UTC m=+1.000000001
4 Done not waiting with err: EOF 2009-11-10 23:00:01 +0000 UTC m=+1.000000001
3 Exit running ... 2009-11-10 23:00:02 +0000 UTC m=+2.000000001
Bye forever.
I am writing a cat with a timeout on receiving the first byte. I have it working except that it can't handle echo -n:
❯ echo -n | time possiblycat 1000 # 1000 is the timeout in milliseconds
possiblycat 1000 0.00s user 0.00s system 0% cpu 1.008 total; max RSS 1864
cat itself has no issues with this; it notices the EOF and exits immediately:
❯ echo -n | time cat
cat 0.00s user 0.00s system 71% cpu 0.003 total; max RSS 664
This is the whole source of possiblycat:
package main
import (
"io"
"io/ioutil"
"os"
"strconv"
"time"
)
func main() {
wait := 10
if len(os.Args) >= 2 {
waitDummy, err := strconv.Atoi(os.Args[1])
if err != nil {
panic(err)
}
wait = waitDummy
}
b := make(chan byte, 1)
go scan(b)
select {
case res := <-b:
inBytes, err := ioutil.ReadAll(os.Stdin)
if err != nil {
panic(err)
}
stdin := append([]byte{res}, inBytes...)
_, err2 := os.Stdout.Write(stdin)
if err2 != nil {
panic(err2)
}
case <-time.After(time.Duration(wait) * time.Millisecond):
os.Exit(1)
}
}
func scan(out chan byte) {
var b []byte = make([]byte, 1)
_, err := os.Stdin.Read(b)
if err == io.EOF {
return
} else if err != nil {
panic(err)
}
out <- b[0]
}
Related:
Does echo -n | … send an EOF to the pipe?
When os.Stdin.Read returns EOF, you exit the scan function which is running in its own goroutine.
However, nothing is being done to tell the main goroutine that all input has been processed. It is waiting for data on channel b, or for the timeout. Since there is no data coming on b, the timeout gets reached.
To properly handle this, the err == io.EOF case should signal the main goroutine that there is no more work to be done. A common pattern (but certainly not the only one) is to have a done channel indicating that all work is finished.
done := make(chan bool, 1)
go scan(b, done)
select {
case res := <-b:
...
case <-done:
os.Exit(1)
case <-time.After(time.Duration(wait) * time.Millisecond):
os.Exit(1)
}
}
func scan(out chan byte, done chan bool) {
var b []byte = make([]byte, 1)
_, err := os.Stdin.Read(b)
if err == io.EOF {
fmt.Println("got EOF, exiting")
done <- true
return
} else if err != nil {
...
}
Another (even simpler) alternative is to simply close the data channel when you're done:
func scan(out chan byte) {
var b []byte = make([]byte, 1)
_, err := os.Stdin.Read(b)
if err == io.EOF {
fmt.Println("got EOF, exiting")
close(out)
return
} else if err != nil {
panic(err)
}
out <- b[0]
}
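One detail to keep in mind with the close(out) variant, sketched below using the same select as in the question: a receive from a closed channel completes immediately with the zero value, so main needs the two-value receive to tell a real first byte apart from a closed (EOF) channel.
select {
case res, ok := <-b:
    if !ok {
        // scan closed the channel on EOF: no input at all, handle it like the done case
        os.Exit(1)
    }
    inBytes, err := ioutil.ReadAll(os.Stdin)
    if err != nil {
        panic(err)
    }
    stdin := append([]byte{res}, inBytes...)
    if _, err := os.Stdout.Write(stdin); err != nil {
        panic(err)
    }
case <-time.After(time.Duration(wait) * time.Millisecond):
    os.Exit(1)
}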
Apache Bench (ab) is a popular load-testing tool.
I wrote a tool with the same functionality in Go myself, and then ran some tests.
It seems to work well.
But by accident I found a difference between my tool and Apache Bench:
When checking the number of ESTABLISHED connections with the command netstat -na|grep ESTABLISHED|wc -l:
With ab -n 128000 -c 128 http://127.0.0.1:8000/, the command above returns a number near 128.
But with my own tool, the command returns a number near 256 (with concurrency set to 128).
Why does my own tool produce twice the expected number of connections?
My code:
func request(cli *http.Client, uri string) {
resp, err := cli.Get(uri)
if err != nil {
panic(err)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
panic("return" + strconv.Itoa(resp.StatusCode))
}
}
func workRoutine(ch chan string, wg *sync.WaitGroup) {
cli := &http.Client{}
for uri := range ch {
request(cli, uri)
}
wg.Done()
}
func main() {
rnum := 128
tnum := 128000
url := "http://127.0.0.1:8000/"
ch := make(chan string)
var wg sync.WaitGroup
go func() {
for i := 0; i < tnum; i++ {
ch <- url
}
close(ch)
}()
wg.Add(rnum)
for i := 0; i < rnum; i++ {
go workRoutine(ch, &wg)
}
stime := time.Now()
wg.Wait()
dtime := time.Now().Sub(stime)
fmt.Println("Timecost", dtime)
fmt.Println("Throughputs", float64(tnum)/dtime.Seconds())
}
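As an aside, and not necessarily the explanation for the doubled ESTABLISHED count: request closes resp.Body without reading it, and Go's http.Transport can only return a keep-alive connection to its idle pool for reuse once the body has been read to EOF and closed, so each request may end up opening a fresh connection. A small sketch of draining the body first (io.Discard is ioutil.Discard on Go versions before 1.16):
func request(cli *http.Client, uri string) {
    resp, err := cli.Get(uri)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    // Read the body to EOF so the Transport can reuse the underlying connection
    // for the next request on this worker.
    if _, err := io.Copy(io.Discard, resp.Body); err != nil {
        panic(err)
    }
    if resp.StatusCode != 200 {
        panic("return " + strconv.Itoa(resp.StatusCode))
    }
}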
I am quite new to Go, so please spare me the sword (if possible).
I was trying to get data from the web by studying the tutorial here.
Now, the tutorial goes all well, but I wanted to check edge cases and error handling (just to be thorough while learning the language; I don't want to be the one with half-baked knowledge).
Here's my go-playground code.
Before asking, I looked at a lot of references like:
Go blog defer,panic and recover
handling panics in goroutines
how-should-i-write-goroutine
And a few more, however I couldn't figure it out much.
Here's the code in case you don't want to go to the playground (for reasons yet unknown to man):
// MakeRequest : Makes requests concurrently
func MakeRequest(url string, ch chan<- string, wg *sync.WaitGroup) {
start := time.Now()
resp, err := http.Get(url)
defer func() {
resp.Body.Close()
wg.Done()
if r := recover(); r != nil {
fmt.Println("Recovered in f", r)
}
}()
if err != nil {
fmt.Println(err)
panic(err)
}
secs := time.Since(start).Seconds()
body, _ := ioutil.ReadAll(resp.Body)
ch <- fmt.Sprintf("%.2f elapsed with response length: %d %s", secs, len(body), url)
}
func main() {
var wg sync.WaitGroup
output := []string{
"https://www.facebook.com",
"",
}
start := time.Now()
ch := make(chan string)
for _, url := range output {
wg.Add(1)
go MakeRequest(url, ch, &wg)
}
for range output {
fmt.Println(<-ch)
}
fmt.Printf("%.2fs elapsed\n", time.Since(start).Seconds())
}
Update
I changed the code to (let's say) handle the error in the goroutine like this (go-playground here):
func MakeRequest(url string, ch chan<- string, wg *sync.WaitGroup) {
start := time.Now()
resp, err := http.Get(url)
if err == nil {
secs := time.Since(start).Seconds()
body, _ := ioutil.ReadAll(resp.Body)
ch <- fmt.Sprintf("%.2f elapsed with response length: %d %s", secs, len(body), url)
// fmt.Println(err)
// panic(err)
}
defer wg.Done()
}
Update 2:
After an answer I changed the code to this, and it successfully removes the channel deadlock; however, I now need to handle this in main:
func MakeRequest(url string, ch chan<- string, wg *sync.WaitGroup) {
defer wg.Done()
start := time.Now()
resp, err := http.Get(url)
if err == nil {
secs := time.Since(start).Seconds()
body, _ := ioutil.ReadAll(resp.Body)
ch <- fmt.Sprintf("%.2f elapsed with response length: %d %s", secs, len(body), url)
// fmt.Println(err)
// panic(err)
}
// defer resp.Body.Close()
ch <- fmt.Sprintf("")
}
Isn't there a more elegant way to handle this? Without that extra send on the channel I get locked up in a deadlock.
Thanks and regards.
Temporarya
(a golang newbie)
You are using recover correctly. You have two problems:
You are using panic incorrectly. You should only panic when there was a programming error. Avoid using panics unless you believe taking down the program is a reasonable response to what happened. In this case, I would just return the error, not panic.
You are panicking during your panic. What is happening is that you are first panicking at panic(err). Then in your defer function, you are panicking at resp.Body.Close(). When http.Get returns an error, it returns a nil response. That means that resp.Body.Close() is acting on a nil value.
The idiomatic way to handle this would be something like the following:
func MakeRequest(url string, ch chan<- string, wg *sync.WaitGroup) {
defer wg.Done()
start := time.Now()
resp, err := http.Get(url)
if err != nil {
//handle error without panicing
}
// there was no error, so resp.Body is guaranteed to exist.
defer resp.Body.Close()
...
Response to update: If http.Get() returns an error, you never send on the channel. At some point all goroutines except the main goroutine stop running, and the main goroutine is waiting on <-ch. Since that channel receive can never complete and there is nothing else for the Go runtime to schedule, it panics with an unrecoverable deadlock error.
Response to comment: To ensure the channel doesn't hang, you need some sort of coordination to know when messages will stop coming. How this is implemented depends on your real program, and an example cannot necessarily extrapolate to reality. For this example, I would simply close the channel when the WaitGroup is done.
Playground
func main() {
var wg sync.WaitGroup
output := []string{
"https://www.facebook.com",
"",
}
start := time.Now()
ch := make(chan string)
for _, url := range output {
wg.Add(1)
go MakeRequest(url, ch, &wg)
}
go func() {
wg.Wait()
close(ch)
}()
for val := range ch {
fmt.Println(val)
}
fmt.Printf("%.2fs elapsed\n", time.Since(start).Seconds())
}
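If the failed URLs should be reported rather than silently replaced with an empty string, another option is to send a small result value on the channel so the error travels with it. This is a sketch on top of the same WaitGroup/close pattern, with a hypothetical result type that is not part of the original code:
type result struct {
    url string
    msg string
    err error
}

func MakeRequest(url string, ch chan<- result, wg *sync.WaitGroup) {
    defer wg.Done()
    start := time.Now()
    resp, err := http.Get(url)
    if err != nil {
        ch <- result{url: url, err: err} // exactly one send per goroutine, success or not
        return
    }
    defer resp.Body.Close()
    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        ch <- result{url: url, err: err}
        return
    }
    ch <- result{
        url: url,
        msg: fmt.Sprintf("%.2f elapsed with response length: %d %s", time.Since(start).Seconds(), len(body), url),
    }
}
main then ranges over the channel exactly as above and prints res.msg or res.err depending on which is set.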
To convert [][]byte to []string, I do this
data, err := ioutil.ReadFile("test.txt")
if err != nil {
return nil, err
}
db := bytes.Split(data, []uint8("\n"))
// Convert [][]byte to []string
s := make([]string, len(db))
for i, val := range db {
s[i] = string(val)
}
fmt.Printf("%v", s)
I am new to Go, and I'm not sure this is the most efficient way to do it.
The most efficient way would be to remove this step: db := bytes.Split(data, []uint8("\n")) and instead iterate over data like this:
func main() {
data, _ := ioutil.ReadFile("test.txt")
s := make([]string, 0)
start := 0
for i := range data {
if data[i] == '\n' {
elem := string(data[start:i]) // data[i] is the '\n' itself, so the line ends just before it
s = append(s, elem)
start = i + 1 // skip the '\n' so it does not start the next line
}
}
fmt.Printf("%v", s)
}
Or if you want to convert [][]byte to []string:
func convert(data [][]byte) []string {
s := make([]string, len(data))
for row := range data {
s[row] = string(data[row])
}
return s
}
If you actually want to convert a file content to a []string, you can use bufio.Scanner which is cleaner (IMO) and more efficient than the code you posted:
func readFile(filename string) ([]string, error) {
file, err := os.Open(filename)
if err != nil {
return nil, err
}
defer file.Close()
scanner := bufio.NewScanner(file)
var data []string
for scanner.Scan() {
line := scanner.Text()
data = append(data, line)
}
if err = scanner.Err(); err != nil {
return nil, err
}
return data, nil
}
Here's a benchmark* comparing the original function (readFile1) and my function (readFile2):
BenchmarkReadFile1-8 300 4632189 ns/op 3035552 B/op 10570 allocs/op
BenchmarkReadFile2-8 1000 1695820 ns/op 2169655 B/op 10587 allocs/op
*the benchmark read a sample file of 1.2 MiB and ~10K lines
The new code runs in 36% of the time and 71% of the memory used by the original function.
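One caveat about the bufio.Scanner version, relevant only if the input can contain very long lines: Scan stops with Err() == bufio.ErrTooLong once a line exceeds the default limit of bufio.MaxScanTokenSize (64 KiB). The buffer can be enlarged before scanning; the 1 MiB cap below is an arbitrary choice, not something taken from the benchmarked file:
scanner := bufio.NewScanner(file)
// Allow individual lines of up to 1 MiB instead of the 64 KiB default.
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
for scanner.Scan() {
    // line := scanner.Text() ...
}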