Bulk insert speed gap when multiple machines are used - go

I have a Go utility that reads documents from flat files and bulk-loads them into Couchbase. The application can insert data at up to 21K writes per second when 2 writer threads run on a single machine (the destination is a remote Couchbase cluster with one server inside the network).
But when 2 writer threads run from 2 different machines (1 thread each), the insertion speed drops to half (10K writes/sec).
Since both machines use their own RAM and CPU for insertion, and the utility has shown speeds of up to 25K writes per second, the network doesn't seem to be the issue (I checked network utilization as well, and it stays below 50 percent when multiple machines are used).
Note: All machines used have an i7 3.40GHz quad-core processor and 8GB RAM. The total amount of data being inserted is up to 500MB.
Bucket Configuration: 5.00GB RAM, Bucket disk I/O priority: High
I need to know what’s causing this speed gap. Please help…
Here is the Code:
package main
import (
"bufio"
"encoding/csv"
"fmt"
"io"
"log"
"strconv"
"os"
"sync"
"runtime"
"gopkg.in/couchbase/gocb.v1"
)
var (
bucket *gocb.Bucket
CB_Host string
)
func main() {
var wg sync.WaitGroup
CB_Host = "<IP Address of Couchbase Server>"
runtime.GOMAXPROCS(runtime.NumCPU())
cluster, err := gocb.Connect("couchbase://" + CB_Host) //..........Establish Couchbase Connection
if err != nil {
fmt.Println("ERROR CONNECTING COUCHBASE:", err)
}
bucket, err = cluster.OpenBucket("BUCKET", "*******")
if err != nil {
fmt.Println("ERROR OPENING BUCKET:", err)
}
Path := "E:\\Data\\File_" \\Path of the text file that contains data
for i := 1; i <= 2; i++{
wg.Add(1)
go InsertDataFromFile(Path+strconv.Itoa(i)+".txt", i, &wg)
}
wg.Wait()
err = bucket.Close() //.............. Close Couchbase Connection
if err != nil {
fmt.Println("ERROR CLOSING COUCHBASE CONNECTION:", err)
}
}
/*-- Main function Ends Here --*/
func InsertDataFromFile(Path string, i int, wg *sync.WaitGroup) (){
var (
ID string
JSONData string
items []gocb.BulkOp
)
csvFile, err := os.Open(Path) //............... Open the flat file containing the data
if err != nil {
log.Fatal(err)
}
reader := csv.NewReader(bufio.NewReader(csvFile))
reader.Comma = '$'
reader.LazyQuotes = true
counter := 1
fmt.Println("Starting Insertion of File "+ strconv.Itoa(i) + "...")
for {
line, error := reader.Read()
if error == io.EOF {
break
} else if error != nil { //...............Parse data and append it into items[] array
log.Fatal(error)
}
ID = line[0]
JSONData = line[1]
items = append(items, &gocb.UpsertOp{Key: ID, Value: JSONData})
if counter % 500 == 0 {
BulkInsert(&items) //................Bulk Insert Next 500 Documents Data into couchbase
items = nil
}
counter = counter + 1
}
BulkInsert(&items) //................Insert remaining documents
items = nil
fmt.Println("Insertion of File "+ strconv.Itoa(i) + " Completed...")
wg.Done()
}
func BulkInsert(item *[]gocb.BulkOp) (){
err := bucket.Do(*item)
if err != nil {
fmt.Println("ERROR PERFORMING BULK INSERT:", err)
}
}
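A side note on the bulk path above (my addition, not part of the original question): bucket.Do reports only an overall error, while each individual operation carries its own result. Below is a minimal sketch of checking per-document errors, written as a drop-in for the same package and assuming the gocb.v1 bulk-op API where *gocb.UpsertOp exposes an Err field:
func BulkInsertChecked(items []gocb.BulkOp) {
    if err := bucket.Do(items); err != nil {
        fmt.Println("ERROR PERFORMING BULK INSERT:", err)
    }
    // Even when Do returns nil, individual upserts may have failed
    // (for example, a temporary server-side condition), so check each op.
    for _, op := range items {
        if upsert, ok := op.(*gocb.UpsertOp); ok && upsert.Err != nil {
            fmt.Println("ERROR UPSERTING KEY", upsert.Key, ":", upsert.Err)
        }
    }
}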

Related

Rate limited with the response MY_CONTACTS_OVERFLOW_COUNT

I am prototyping an application using Google's new People API. During my testing I have added and deleted contacts in batches, to see how many can be added per minute and per day in total.
I understand the documentation says how many can be added per minute, but from my testing I don't seem to get anywhere close to this. Even when reviewing my metrics, my request volume is far below the supposed limits per minute and per day.
My main question is: after a couple of attempts with a service account across 3 of my Gmail accounts, I am now getting back googleapi: Error 429: MY_CONTACTS_OVERFLOW_COUNT, rateLimitExceeded. I can't find any mention of MY_CONTACTS_OVERFLOW_COUNT online. I assumed from the error that it meant I have too many contacts, but when running a delete script it appears I don't have any at all.
This response has been returned for all 3 accounts on my development machine for longer than 24 hours now, which makes me believe I may have been blocked rather than rate limited?
Client code for running the test:
package main
import (
"context"
"log"
"google.golang.org/api/people/v1"
"os"
"bufio"
"time"
//"github.com/davecgh/go-spew/spew"
)
func chunks(xs []string, chunkSize int) [][]string {
if len(xs) == 0 {
return nil
}
divided := make([][]string, (len(xs)+chunkSize-1)/chunkSize)
prev := 0
i := 0
till := len(xs) - chunkSize
for prev < till {
next := prev + chunkSize
divided[i] = xs[prev:next]
prev = next
i++
}
divided[i] = xs[prev:]
return divided
}
func main(){
ctx := context.Background()
srv, err := people.NewService(ctx)
if err != nil {
log.Fatalf("Unable to create people Client %v", err)
}
file, err := os.Open("test125k.txt")
if err != nil {
log.Fatalf("failed opening file: %s", err)
}
scanner := bufio.NewScanner(file)
scanner.Split(bufio.ScanLines)
var txtlines []string
for scanner.Scan() {
txtlines = append(txtlines, scanner.Text())
}
chunkEmails := chunks(txtlines,200)
count := 0
var validPeopleResources []string
log.Printf("Started")
for i,chunk := range chunkEmails{ //
var contacts people.BatchCreateContactsRequest
contacts.ReadMask = "emailAddresses,photos"
for _,chunkEmail := range chunk{
var contact people.ContactToCreate
var person people.Person
var personEmails people.EmailAddress
personEmails.Value = chunkEmail
var AllEmails = [](*people.EmailAddress){
&personEmails,
}
person.EmailAddresses = AllEmails
contact.ContactPerson = &person
contacts.Contacts = append(contacts.Contacts, &contact)
}
r,err := srv.People.BatchCreateContacts(&contacts).Do()
if err != nil {
log.Printf("Unable to create contacts")
log.Printf(err.Error())
log.Fatalf("")
}
var contactEmail string
var resource string
for _, validPeople := range r.CreatedPeople {
contactEmail = validPeople.Person.EmailAddresses[0].Value
resource = validPeople.Person.ResourceName
validPeopleResources = append(validPeopleResources,resource)
}
count = count + 1
if count == 2 {
var contactToDelete people.BatchDeleteContactsRequest
contactToDelete.ResourceNames = validPeopleResources
_,err = srv.People.BatchDeleteContacts(&contactToDelete).Do()
if err != nil {
log.Printf("Unable to delete contacts")
log.Printf(err.Error())
log.Fatalf("")
}
validPeopleResources = nil
count = 0
log.Printf("performed delete")
}
log.Printf("%d comlpeted",i)
time.Sleep(10 * time.Second)
}
}
"MY_CONTACTS_OVERFLOW_COUNT" happens when you try to insert new contacts to the Google account, but they already have the maximum number of contacts.
The max limit is 25,000, since 2011: https://workspaceupdates.googleblog.com/2011/05/need-more-contacts-in-gmail-contacts.html
It should be noted here that Google also takes the deleted contacts into account for calculating the count. You can find the deleted contacts in the Contacts Trash. These contacts will be cleared after 30 days.
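To sanity-check how many contacts an account actually holds, a minimal sketch against the same people/v1 Go client can list connections and read the reported total. This is my illustration, not part of the original answer; whether trashed contacts are reflected in TotalItems is an assumption you should verify against the current API docs:
package main

import (
    "context"
    "log"

    "google.golang.org/api/people/v1"
)

func main() {
    ctx := context.Background()
    srv, err := people.NewService(ctx)
    if err != nil {
        log.Fatalf("Unable to create people client: %v", err)
    }
    // Ask for a single connection; we only care about the totals
    // reported in the response, not the contact data itself.
    resp, err := srv.People.Connections.List("people/me").
        PersonFields("names").
        PageSize(1).
        Do()
    if err != nil {
        log.Fatalf("Unable to list connections: %v", err)
    }
    log.Printf("totalItems reported by the API: %d", resp.TotalItems)
}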

How to make concurrent GET requests from url pool

I completed the suggested go-tour, watched some tutorials and gopher-conferences on YouTube. And that's pretty much it.
I have a project which requires me to send GET requests and store the results in files. But the number of URLs is around 80 million.
I'm testing with 1000 URLs only.
Problem: I don't think I managed to make it concurrent, although I've followed some guidelines. I don't know what's wrong. But maybe I'm wrong and it is concurrent; it just didn't seem fast to me, the speed felt like sequential requests.
Here is the code I've written:
package main
import (
"bufio"
"io/ioutil"
"log"
"net/http"
"os"
"sync"
"time"
)
var wg sync.WaitGroup // synchronization to wait for all the goroutines
func crawler(urlChannel <-chan string) {
defer wg.Done()
client := &http.Client{Timeout: 10 * time.Second} // single client is sufficient for multiple requests
for urlItem := range urlChannel {
req1, _ := http.NewRequest("GET", "http://"+urlItem, nil) // generating the request
req1.Header.Add("User-agent", "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/74.0") // changing user-agent
resp1, respErr1 := client.Do(req1) // sending the prepared request and getting the response
if respErr1 != nil {
continue
}
defer resp1.Body.Close()
if resp1.StatusCode/100 == 2 { // means server responded with 2xx code
text1, readErr1 := ioutil.ReadAll(resp1.Body) // try to read the sourcecode of the website
if readErr1 != nil {
log.Fatal(readErr1)
}
f1, fileErr1 := os.Create("200/" + urlItem + ".txt") // creating the relative file
if fileErr1 != nil {
log.Fatal(fileErr1)
}
defer f1.Close()
_, writeErr1 := f1.Write(text1) // writing the sourcecode into our file
if writeErr1 != nil {
log.Fatal(writeErr1)
}
}
}
}
func main() {
file, err := os.Open("urls.txt") // the file containing the url's
if err != nil {
log.Fatal(err)
}
defer file.Close() // don't forget to close the file
urlChannel := make(chan string, 1000) // create a channel to store all the url's
scanner := bufio.NewScanner(file) // each line has another url
for scanner.Scan() {
urlChannel <- scanner.Text()
}
close(urlChannel)
_ = os.Mkdir("200", 0755) // if it's there, it will create an error, and we will simply ignore it
for i := 0; i < 10; i++ {
wg.Add(1)
go crawler(urlChannel)
}
wg.Wait()
}
My question is: why is this code not working concurrently? How can I solve the problem I've mentioned above? Is there something I'm doing wrong in making concurrent GET requests?
Here's some code to get you thinking. I put the URLs in the code so it is self-sufficient, but you'd probably be piping them to stdin in practice. There's a few things I'm doing here that I think are improvements, or at least worth thinking about.
Before we get started, I'll point out that I put the complete url in the input stream. For one thing, this lets me support http and https both. I don't really see the logic behind hard coding the scheme in the code rather than leaving it in the data.
First, it can handle arbitrarily sized response bodies (your version reads the body into memory, so it is limited by some number of concurrent large requests filling memory). I do this with io.Copy().
text1, readErr1 := ioutil.ReadAll(resp1.Body) reads the entire http body. If the body is large, it will take up lots of memory. io.Copy(f1,resp1.Body) would instead copy the data from the http response body directly to the file, without having to hold the whole thing in memory. It may be done in one Read/Write or many.
http.Response.Body is an io.ReadCloser because the HTTP protocol expects the body to be read progressively. http.Response does not yet have the entire body, until it is read. That's why it's not just a []byte. Writing it to the filesystem progressively while the data "streams" in from the tcp socket means that a finite amount of system resources can download an unlimited amount of data.
But there's even more benefit. io.Copy will call ReadFrom() on the file. If you look at the Linux implementation (for example): https://golang.org/src/os/readfrom_linux.go, and dig a bit, you'll see it actually uses copy_file_range. That system call is cool because
The copy_file_range() system call performs an in-kernel copy between two file descriptors without the additional cost of transferring data from the kernel to user space and then back into the kernel.
*os.File knows how to ask the kernel to deliver data directly from the tcp socket to the file without your program even having to touch it.
See https://golang.org/pkg/io/#Copy.
Second, I make sure to use all the url components in the filename. URLs with different query strings go to different files. The fragment probably doesn't differentiate response bodies, so including that in the path may be ill considered. There's no awesome heuristic for turning URLs into valid file paths - if this were a serious task, I'd probably store the data in files based on a shasum of the url or something - and create an index of results stored in a metadata file.
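For what it's worth, here is a rough sketch of the shasum idea (my own illustration, not the answer author's): hash the full URL to get a stable, filesystem-safe name, and keep the URL-to-file mapping in a separate index.
package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "net/url"
    "path/filepath"
)

// destPathFor hashes the full URL so every distinct URL (scheme, host,
// path, query, fragment) maps to a unique, filesystem-safe file name.
func destPathFor(u *url.URL, baseDir string) string {
    sum := sha256.Sum256([]byte(u.String()))
    return filepath.Join(baseDir, hex.EncodeToString(sum[:])+".body")
}

func main() {
    u, err := url.Parse("https://farrellit.net/?3=2&#1")
    if err != nil {
        panic(err)
    }
    fmt.Println(destPathFor(u, "responses"))
}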
Third, I handle all errors. req1, _ := http.NewRequest(... might seem like a convenient shortcut, but what it really means is that you won't know the real cause of any errors - at best. I usually add some descriptive text to the errors when percolating up, to make sure I can easily tell which error I'm returning.
Finally, I return successfully processed URLs so that I can see the final results. When scanning millions of URLS, you'd probably also want a list of which failed, but a count of successful is a good start at sending final data back for summary.
package main
import (
"bufio"
"bytes"
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"path/filepath"
"time"
)
const urls_text = `http://danf.us/
https://farrellit.net/?3=2&#1
`
func crawler(urls <-chan *url.URL, done chan<- int) {
var processed int = 0
defer func() { done <- processed }()
client := http.Client{Timeout: 10 * time.Second}
for u := range urls {
if req, err := http.NewRequest("GET", u.String(), nil); err != nil {
log.Printf("Couldn't create new request for %s: %s", u.String(), err.Error())
} else {
req.Header.Add("User-agent", "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/74.0") // changing user-agent
if res, err := client.Do(req); err != nil {
log.Printf("Failed to get %s: %s", u.String(), err.Error())
} else {
filename := filepath.Base(u.EscapedPath())
if filename == "/" || filename == "" {
filename = "response"
} else {
log.Printf("URL Filename is '%s'", filename)
}
destpath := filepath.Join(
res.Status, u.Scheme, u.Hostname(), u.EscapedPath(),
fmt.Sprintf("?%s",u.RawQuery), fmt.Sprintf("#%s",u.Fragment), filename,
)
if err := os.MkdirAll(filepath.Dir(destpath), 0755); err != nil {
log.Printf("Couldn't create directory %s: %s", filepath.Dir(destpath), err.Error())
} else if f, err := os.OpenFile(destpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644); err != nil {
log.Printf("Couldn't open destination file %s: %s", destpath, err.Error())
} else {
if b, err := io.Copy(f, res.Body); err != nil {
log.Printf("Could not copy %s body to %s: %s", u.String(), destpath, err.Error())
} else {
log.Printf("Copied %d bytes from body of %s to %s", b, u.String(), destpath)
processed++
}
f.Close()
}
res.Body.Close()
}
}
}
}
const workers = 3
func main() {
urls := make(chan *url.URL)
done := make(chan int)
var submitted int = 0
var inputted int = 0
var successful int = 0
for i := 0; i < workers; i++ {
go crawler(urls, done)
}
sc := bufio.NewScanner(bytes.NewBufferString(urls_text))
for sc.Scan() {
inputted++
if u, err := url.Parse(sc.Text()); err != nil {
log.Printf("Could not parse %s as url: %w", sc.Text(), err)
} else {
submitted++
urls <- u
}
}
close(urls)
for i := 0; i < workers; i++ {
successful += <-done
}
log.Printf("%d urls input, %d could not be parsed. %d/%d valid URLs successful (%.0f%%)",
inputted, inputted-submitted,
successful, submitted,
float64(successful)/float64(submitted)*100.0,
)
}
When setting up a concurrent pipeline, a good guideline to follow is to always first set up and instantiate the listeners that will execute concurrently (in your case, crawlers), and then start feeding them data through the pipeline (in your case, the urlChannel).
In your example, the only thing preventing a deadlock is the fact that you've instantiated a buffered channel with the same number of rows that your test file has (1000 rows). What the code does is it puts URLs inside the urlChannel. Since there are 1000 rows inside your file, the urlChannel can take all of them without blocking. If you put more URLs inside the file, the execution will block after filling up the urlChannel.
Here is the version of the code that should work:
package main
import (
"bufio"
"io/ioutil"
"log"
"net/http"
"os"
"sync"
"time"
)
func crawler(wg *sync.WaitGroup, urlChannel <-chan string) {
defer wg.Done()
client := &http.Client{Timeout: 10 * time.Second} // single client is sufficient for multiple requests
for urlItem := range urlChannel {
req1, _ := http.NewRequest("GET", "http://"+urlItem, nil) // generating the request
req1.Header.Add("User-agent", "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/74.0") // changing user-agent
resp1, respErr1 := client.Do(req1) // sending the prepared request and getting the response
if respErr1 != nil {
continue
}
if resp1.StatusCode/100 == 2 { // means server responded with 2xx code
text1, readErr1 := ioutil.ReadAll(resp1.Body) // try to read the sourcecode of the website
if readErr1 != nil {
log.Fatal(readErr1)
}
resp1.Body.Close()
f1, fileErr1 := os.Create("200/" + urlItem + ".txt") // creating the relative file
if fileErr1 != nil {
log.Fatal(fileErr1)
}
_, writeErr1 := f1.Write(text1) // writing the sourcecode into our file
if writeErr1 != nil {
log.Fatal(writeErr1)
}
f1.Close()
}
}
}
func main() {
var wg sync.WaitGroup
file, err := os.Open("urls.txt") // the file containing the url's
if err != nil {
log.Fatal(err)
}
defer file.Close() // don't forget to close the file
urlChannel := make(chan string)
_ = os.Mkdir("200", 0755) // if it's there, it will create an error, and we will simply ignore it
// first, initialize crawlers
wg.Add(10)
for i := 0; i < 10; i++ {
go crawler(&wg, urlChannel)
}
//after crawlers are initialized, start feeding them data through the channel
scanner := bufio.NewScanner(file) // each line has another url
for scanner.Scan() {
urlChannel <- scanner.Text()
}
close(urlChannel)
wg.Wait()
}

Multiple serial requests result in empty buffer

The first TCP connection running on localhost on osx always parses the binary sent to it correctly. Subsequent requests lose the binary data, only seeing the first byte [8]. How have I failed to set up my Reader?
package main
import (
"fmt"
"log"
"net"
"os"
"app/src/internal/handler"
"github.com/golang-collections/collections/stack"
)
func main() {
port := os.Getenv("SERVER_PORT")
s := stack.New()
ln, err := net.Listen("tcp", ":8080")
if err != nil {
log.Fatalf("net.Listen: %v", err)
}
fmt.Println("Serving on " + port)
for {
conn, err := ln.Accept()
// defer conn.Close()
if err != nil {
log.Fatal("ln.Accept")
}
go handler.Handle(conn, s)
}
}
package handler
import (
"fmt"
"io"
"log"
"net"
"github.com/golang-collections/collections/stack"
)
func Handle(c net.Conn, s *stack.Stack) {
fmt.Printf("Serving %s\n", c.RemoteAddr().String())
buf := make([]byte, 0, 256)
tmp := make([]byte, 128)
n, err := c.Read(tmp)
if err != nil {
if err != io.EOF {
log.Fatalf("connection Read() %v", err)
}
return
}
buf = append(buf, tmp[:n]...)
}
log:
Serving [::1]:51699
------------- value ---------------:QCXhoy5t
Buffer Length: 9. First Value: 8
Serving [::1]:51700
------------- value ---------------:
Buffer Length: 1. First Value: 8
Serving [::1]:51701
test sent over:
push random string:
QCXhoy5t
push random string:
GPh0EnbS
push random string:
4kJ0wN0R
The docs for Reader say:
Read reads up to len(p) bytes into p. It returns the number of bytes read (0 <= n <= len(p)) and any error encountered. Even if Read returns n < len(p), it may use all of p as scratch space during the call. If some data is available but not len(p) bytes, Read conventionally returns what is available instead of waiting for more.
So the most likely cause of your issue is that Read is returning the data available (in this case a single character). You can fix this by using ioutil.ReadAll or performing the read in a loop (the fact the data is being added to a buffer makes it look like that was the original intention) with something like:
for {
n, err := c.Read(tmp)
if err != nil {
if err != io.EOF {
// Note that data might have also been received - you should process that
// if appropriate.
log.Fatalf("connection Read() %v", err)
return
}
break // All data received so process it
}
buf = append(buf, tmp[:n]...)
}
Note: There is no guarantee that any data is received; you should check the length before trying to access it (i.e. buf[0] may panic)
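For completeness, here is a minimal sketch of the ioutil.ReadAll variant mentioned above (my illustration; the stack parameter is dropped for brevity, and it assumes the client closes its side of the connection after writing, since ReadAll only returns at EOF or on error):
package handler

import (
    "fmt"
    "io/ioutil"
    "log"
    "net"
)

// Handle reads everything the client sends until the peer closes the
// connection, then processes the complete payload at once.
func Handle(c net.Conn) {
    defer c.Close()
    buf, err := ioutil.ReadAll(c) // blocks until EOF from the peer
    if err != nil {
        log.Printf("connection Read() %v", err)
        return
    }
    fmt.Printf("received %d bytes\n", len(buf))
}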

Farm out work to a slice but limit number of workers

I'm trying to improve the performance of an app.
One part of its code uploads a file to a server in chunks.
The original version simply does this in a sequential loop. However, it's slow and during the sequence it also needs to talk to another server before uploading each chunk.
The upload of chunks could simply be placed in a goroutine. It works, but is not a good solution because if the source file is extremely large it ends up using a large amount of memory.
So, I try to limit the number of active goroutines by using a buffered channel. Here is some code that shows my attempt. I've stripped it down to show the concept and you can run it to test for yourself.
package main
import (
"fmt"
"io"
"os"
"time"
)
const defaultChunkSize = 1 * 1024 * 1024
// Lets have 4 workers
var c = make(chan int, 4)
func UploadFile(f *os.File) error {
fi, err := f.Stat()
if err != nil {
return fmt.Errorf("err: %s", err)
}
size := fi.Size()
total := (int)(size/defaultChunkSize + 1)
// Upload parts
buf := make([]byte, defaultChunkSize)
for partno := 1; partno <= total; partno++ {
readChunk := func(offset int, buf []byte) (int, error) {
fmt.Println("readChunk", partno, offset)
n, err := f.ReadAt(buf, int64(offset))
if err != nil {
return n, err
}
return n, nil
}
// This will block if there are not enough worker slots available
c <- partno
// The actual worker.
go func() {
offset := (partno - 1) * defaultChunkSize
n, err := readChunk(offset, buf)
if err != nil && err != io.EOF {
return
}
err = uploadPart(partno, buf[:n])
if err != nil {
fmt.Println("Uploadpart failed:", err)
}
<-c
}()
}
return nil
}
func uploadPart(partno int, buf []byte) error {
fmt.Printf("Uploading partno: %d, buflen=%d\n", partno, len(buf))
// Actually upload the part. Lets test it by instead writing each
// buffer to another file. We can then use diff to compare the
// source and dest files.
// Open file. Seek to (partno - 1) * defaultChunkSize, write buffer
f, err := os.OpenFile("/home/matthewh/Downloads/out.tar.gz", os.O_CREATE|os.O_WRONLY, 0755)
if err != nil {
fmt.Printf("err: %s\n", err)
}
n, err := f.WriteAt(buf, int64((partno-1)*defaultChunkSize))
if err != nil {
fmt.Printf("err=%s\n", err)
}
fmt.Printf("%d bytes written\n", n)
defer f.Close()
return nil
}
func main() {
filename := "/home/matthewh/Downloads/largefile.tar.gz"
fmt.Printf("Opening file: %s\n", filename)
f, err := os.Open(filename)
if err != nil {
panic(err)
}
UploadFile(f)
}
It almost works. But there are several problems.
1) The final partno 22 is occurring 3 times. The correct length is actually 612545, as the file length isn't a multiple of 1MB.
// Sample output
...
readChunk 21 20971520
readChunk 22 22020096
Uploading partno: 22, buflen=1048576
Uploading partno: 22, buflen=612545
Uploading partno: 22, buflen=1048576
Another problem: the upload could fail, and I am not familiar enough with Go to know how best to handle failure of the goroutine.
Finally, I ordinarily want to return some data from uploadPart when it succeeds. Specifically, it'll be a string (an HTTP ETag header value). These ETag values need to be collected by the main function.
What is a better way to structure this code in this instance? I've not yet found a good golang design pattern that correctly fulfills my needs here.
Skipping for the moment the question of how better to structure this code, I see a bug in your code which may be causing the problem you're seeing. Since the function you're running in the goroutine uses the variable partno, which changes with each iteration of the loop, your goroutine isn't necessarily seeing the value of partno at the time you invoked the goroutine. A common way of fixing this is to create a local copy of that variable inside the loop:
for partno := 1; partno <= total; partno++ {
partno := partno
// ...
}
Data race #1
Multiple goroutines are using the same buffer concurrently. Note that one goroutine may be filling it with a new chunk while another is still reading an old chunk from it. Instead, each goroutine should have its own buffer.
Data race #2
As Andy Schweig has pointed out, the value in partno is updated by the loop before the goroutine created in that iteration has a chance to read it. This is why the final partno 22 occurs multiple times. To fix it, you can pass partno as an argument to the anonymous function. That will ensure each goroutine has its own part number.
Also, you can use a channel to pass the results from the workers. Maybe a struct type with the part number and error. That way, you will be able to observe the progress and retry failed uploads.
For an example of a good pattern check out this example from the GOPL book.
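As a rough, self-contained sketch of that suggestion (the uploadResult type, the fake uploadPart, and the part count are my own illustration, not from the original post), the workers could report back on a results channel and the caller could collect the ETags and errors:
package main

import "fmt"

// uploadResult is what each worker reports back to the caller.
type uploadResult struct {
    partno int
    etag   string // e.g. the HTTP ETag returned by the upload
    err    error
}

// c limits the number of concurrent workers to 4, as in the original code.
var c = make(chan int, 4)

// uploadPart is a stand-in for the real upload; it returns a fake ETag.
func uploadPart(partno int) (string, error) {
    return fmt.Sprintf("etag-%d", partno), nil
}

func main() {
    const total = 10
    results := make(chan uploadResult, total)
    for partno := 1; partno <= total; partno++ {
        c <- partno // blocks when all 4 worker slots are busy
        go func(partno int) {
            etag, err := uploadPart(partno)
            results <- uploadResult{partno: partno, etag: etag, err: err}
            <-c
        }(partno)
    }
    // Collect exactly one result per part; failed parts could be retried here.
    etags := make([]string, total)
    for i := 0; i < total; i++ {
        r := <-results
        if r.err != nil {
            fmt.Printf("part %d failed: %v\n", r.partno, r.err)
            continue
        }
        etags[r.partno-1] = r.etag
    }
    fmt.Println("collected etags:", etags)
}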
Suggested changes
As noted by dev.bmax, buf is moved into the goroutine; as noted by Andy Schweig, partno is passed as a parameter to the anonymous function. A WaitGroup was also added, since UploadFile was exiting before the uploads were complete. The file is also closed with defer f.Close(), which is a good habit.
package main
import (
"fmt"
"io"
"os"
"sync"
"time"
)
const defaultChunkSize = 1 * 1024 * 1024
// wg for uploads to complete
var wg sync.WaitGroup
// Lets have 4 workers
var c = make(chan int, 4)
func UploadFile(f *os.File) error {
// wait for all the uploads to complete before function exit
defer wg.Wait()
fi, err := f.Stat()
if err != nil {
return fmt.Errorf("err: %s", err)
}
size := fi.Size()
fmt.Printf("file size: %v\n", size)
total := int(size/defaultChunkSize + 1)
// Upload parts
for partno := 1; partno <= total; partno++ {
readChunk := func(offset int, buf []byte, partno int) (int, error) {
fmt.Println("readChunk", partno, offset)
n, err := f.ReadAt(buf, int64(offset))
if err != nil {
return n, err
}
return n, nil
}
// This will block if there are not enough worker slots available
c <- partno
// The actual worker.
// register this upload before launching the goroutine, so the
// deferred wg.Wait() cannot return before the worker is counted
wg.Add(1)
go func(partno int) {
defer wg.Done()
buf := make([]byte, defaultChunkSize)
offset := (partno - 1) * defaultChunkSize
n, err := readChunk(offset, buf, partno)
if err != nil && err != io.EOF {
return
}
err = uploadPart(partno, buf[:n])
if err != nil {
fmt.Println("Uploadpart failed:", err)
}
<-c
}(partno)
}
return nil
}
func uploadPart(partno int, buf []byte) error {
fmt.Printf("Uploading partno: %d, buflen=%d\n", partno, len(buf))
// Actually do the upload. Simulate long running task with a sleep
time.Sleep(time.Second)
return nil
}
func main() {
filename := "/home/matthewh/Downloads/largefile.tar.gz"
fmt.Printf("Opening file: %s\n", filename)
f, err := os.Open(filename)
if err != nil {
panic(err)
}
defer f.Close()
UploadFile(f)
}
I'm sure you can deal a little more smartly with the buf situation; I'm just letting Go deal with the garbage. Since you are limiting your workers to a specific number (4), you really only need 4 x defaultChunkSize buffers. Please do share if you come up with something simple and worth sharing.
Have fun!
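One simple way to get the "4 buffers for 4 workers" behaviour (my own sketch, not from the original answer) is to pre-allocate the buffers and hand them out through a channel that doubles as the worker limiter:
package main

import "fmt"

const defaultChunkSize = 1 * 1024 * 1024
const workers = 4

func main() {
    // The channel holds the buffers themselves: receiving one both
    // acquires a worker slot and gives you a reusable buffer.
    bufs := make(chan []byte, workers)
    for i := 0; i < workers; i++ {
        bufs <- make([]byte, defaultChunkSize)
    }

    done := make(chan struct{})
    const parts = 10
    for partno := 1; partno <= parts; partno++ {
        buf := <-bufs // blocks until a buffer (and worker slot) is free
        go func(partno int, buf []byte) {
            // ... read the chunk into buf and upload it here ...
            fmt.Printf("would upload part %d with a %d-byte buffer\n", partno, len(buf))
            bufs <- buf // return the buffer, freeing the slot
            done <- struct{}{}
        }(partno, buf)
    }
    for i := 0; i < parts; i++ {
        <-done
    }
}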

Download from multiple sources in parallel using a goroutine

First I would like to say I've already viewed Golang download multiple files in parallel using goroutines and Example for sync.WaitGroup correct?, and I've used them as a guide in my code. However, I'm not certain that it's working for me. I'm trying to download files from multiple buckets on AWS. This is what I have (some lines will be blank for security reasons).
package main
import (
"fmt"
"os"
"os/user"
"path/filepath"
"sync"
"time"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/s3"
"github.com/aws/aws-sdk-go/service/s3/s3manager"
)
var (
//Bucket = "" // Download from this bucket
Prefix = "" // Using this key prefix
LocalDirectory = "s3logs" // Into this directory
)
// create a single session to be used
var sess = session.New()
// used to control concurrency
var wg sync.WaitGroup
func main() {
start := time.Now()
//map of buckets to region
regBuckets := map[string]string{
}
// download the files for each bucket
for region, bucket := range regBuckets {
fmt.Println(region)
wg.Add(1)
go getLogs(region, bucket, LocalDirectory, &wg)
}
wg.Wait()
elapsed := time.Since(start)
fmt.Printf("\nTime took %s\n", elapsed)
}
// function to get data from buckets
func getLogs(region string, bucket string, directory string, wg *sync.WaitGroup) {
client := s3.New(sess, &aws.Config{Region: aws.String(region)})
params := &s3.ListObjectsInput{Bucket: &bucket, Prefix: &Prefix}
manager := s3manager.NewDownloaderWithClient(client, func(d *s3manager.Downloader) {
d.PartSize = 6 * 1024 * 1024 // 6MB per part
d.Concurrency = 5
})
d := downloader{bucket: bucket, dir: directory, Downloader: manager}
client.ListObjectsPages(params, d.eachPage)
wg.Done()
}
// downloader object and methods
type downloader struct {
*s3manager.Downloader
bucket, dir string
}
func (d *downloader) eachPage(page *s3.ListObjectsOutput, more bool) bool {
for _, obj := range page.Contents {
d.downloadToFile(*obj.Key)
}
return true
}
func (d *downloader) downloadToFile(key string) {
// Create the directories in the path
// desktop path
user, errs := user.Current()
if errs != nil {
panic(errs)
}
homedir := user.HomeDir
desktop := homedir + "/Desktop/" + d.dir
file := filepath.Join(desktop, key)
if err := os.MkdirAll(filepath.Dir(file), 0775); err != nil {
panic(err)
}
// Setup the local file
fd, err := os.Create(file)
if err != nil {
panic(err)
}
defer fd.Close()
// Download the file using the AWS SDK
fmt.Printf("Downloading s3://%s/%s to %s...\n", d.bucket, key, file)
params := &s3.GetObjectInput{Bucket: &d.bucket, Key: &key}
_, e := d.Download(fd, params) // download once and check the error
if e != nil {
panic(e)
}
}
In the regBuckets map I place a list of bucket names mapped to regions. In the for loop below it I print the bucket name. So if I have two buckets, I want to download the items from both buckets at the same time. I was testing this with a print statement: I expected to see the name of the first bucket and soon after the name of the second bucket. However, it seems that instead of downloading the files from multiple buckets in parallel, it's downloading them in order, e.g. bucket 1, and when bucket 1 is done the for loop continues to bucket 2, etc. So I need help making sure I'm downloading in parallel, because I have roughly 10 buckets and speed is important. I also wonder if it's because I'm using a single session. Any ideas?
