I want to use goroutines to batch requests from different customers' with different date.
I mean 50 consumer goroutines to consume all customers from db, and 2 date consumer goroutines to consume date slice.
Main codes as below, but it hung and didn't exit as expected.
Why doesn't it exit as expected?
func Run(){
var syncWg sync.WaitGroup
syncWg.Add(1)
go SyncCustomerMetricsHistory(&syncWg)
syncWg.Wait()
}
func SyncCustomerMetricsHistory(wg *sync.WaitGroup){
defer wg.Done()
odb := orm.NewOrm()
start := time.Now()
logs.Info("start sync customer metrics, time:[%v]", start)
qs := odb.QueryTable("gg_customer")
var customers []*db.GgCustomer
if num, err := qs.All(&customers); err != nil || num == 0 {
logs.Error("Get customer error, rows:[%v], err:[%v]", num, err)
}
customersChan := make(chan *db.GgCustomer, 50)
var wgC sync.WaitGroup
wgC.Add(50)
for i := 0; i < 50; i++ {
go syncCustomerMetricsHistory(customersChan, &wgC)
}
go func() {
for _, customer := range customers {
customersChan <- customer
}
close(customersChan)
}()
wgC.Wait()
}
func syncCustomerMetricsHistory(customerChan <- chan *db.GgCustomer, wg *sync.WaitGroup){
defer wg.Done()
for customer := range customerChan{
dateChan := make(chan string, 2)
var wgD sync.WaitGroup
wgD.Add(2)
for i := 1; i < 2; i++{
go test(dateChan, customer, &wgD)
}
go func(){
for _, date := range GetAllYearDate(){
dateChan <- date
}
close(dateChan)
}()
wgD.Wait()
}
}
}
func test(dateChan <- chan string, customer *db.GgCustomer, wg *sync.WaitGroup){
defer wg.Done()
for date := range dateChan{
fmt.Println(date, customer)
}
}
func GetAllYearDate() []string{
return []string{"2019-10-01", "2019-10-02"}
}
I have not tried to run this (as it requires additional code) but believe your issue is:
wgD.Add(2)
for i := 1; i < 2; i++{
go test(dateChan, customer, &wgD)
}
That for loop will only iterate once but you called wgD.Add(2) (I think you probably meant the loop to iterate twice; try i <= 2).
One other bit of feedback; the way you are using waitgroups will work but is hard to follow (perhaps leading to you not spotting the issue); how about something like:
func Run(){
SyncCustomerMetricsHistory() // No wait group needed as this will not return before done
}
func SyncCustomerMetricsHistory(){
odb := orm.NewOrm()
start := time.Now()
logs.Info("start sync customer metrics, time:[%v]", start)
qs := odb.QueryTable("gg_customer")
var customers []*db.GgCustomer
if num, err := qs.All(&customers); err != nil || num == 0 {
logs.Error("Get customer error, rows:[%v], err:[%v]", num, err)
}
customersChan := make(chan *db.GgCustomer, 50)
var wgC sync.WaitGroup
wgC.Add(50)
for i := 0; i < 50; i++ {
go func() {
syncCustomerMetricsHistory(customersChan)
wgC.Done()
}()
}
go func() {
for _, customer := range customers {
customersChan <- customer
}
close(customersChan)
}()
wgC.Wait()
}
func syncCustomerMetricsHistory(customerChan <- chan *db.GgCustomer){
for customer := range customerChan{
dateChan := make(chan string, 2)
var wgD sync.WaitGroup
wgD.Add(2)
for i := 1; i < 2; i++{
go func() {
test(dateChan, customer)
wgD.Done()
}()
}
go func(){
for _, date := range GetAllYearDate(){
dateChan <- date
}
close(dateChan)
}()
wgD.Wait()
}
}
}
I think this is easier to follow because you can see where wg.Done() is being called. It's also really easy to stick some fmt.Println commands on either side which makes it simpler to debug this kind of issue.
Related
I have written producer-consumer pattern in golang. Reading multiple csv files and processing records. I am reading all records of csv file in one go.
I want to log percentage of processing completion in interval of 5% of total records including all csv files. for e.g I have 3 csv to process & each have 20,30,50 rows/records (so in total 100 records to process) want to log progress when 5 records are processed.
func processData(inputCSVFiles []string) {
producerCount := len(inputCSVFiles)
consumerCount := producerCount
link := make(chan []string, 100)
wp := &sync.WaitGroup{}
wc := &sync.WaitGroup{}
wp.Add(producerCount)
wc.Add(consumerCount)
for i := 0; i < producerCount; i++ {
go produce(link, inputCSVFiles[i], wp)
}
for i := 0; i < consumerCount; i++ {
go consume(link, wc)
}
wp.Wait()
close(link)
wc.Wait()
fmt.Println("Completed data migration process for all CSV data files.")
}
func produce(link chan<- []string, filePath string, wg *sync.WaitGroup) {
defer wg.Done()
records := readCsvFile(filePath)
totalNumberOfRecords := len(records)
for _, record := range records {
link <- record
}
}
func consume(link <-chan []string, wg *sync.WaitGroup) {
defer wg.Done()
for record := range link {
// process csv record
}
}
I have used atomic variable & counter channel where consumer will push count when record is processed & other goroutine will read from channel & calculate total processed record percentage.
var progressPercentageStep float64 = 5.0
var totalRecordsToProcess int32
func processData(inputCSVFiles []string) {
producerCount := len(inputCSVFiles)
consumerCount := producerCount
link := make(chan []string, 100)
counter := make(chan int, 100)
defer close(counter)
wp := &sync.WaitGroup{}
wc := &sync.WaitGroup{}
wp.Add(producerCount)
wc.Add(consumerCount)
for i := 0; i < producerCount; i++ {
go produce(link, inputCSVFiles[i], wp)
}
go progressStats(counter)
for i := 0; i < consumerCount; i++ {
go consume(link, wc)
}
wp.Wait()
close(link)
wc.Wait()
}
func produce(link chan<- []string, filePath string, wg *sync.WaitGroup) {
defer wg.Done()
records := readCsvFile(filePath)
atomic.AddInt32(&totalRecordsToProcess, int32(len(records)))
for _, record := range records {
link <- record
}
}
func consume(link <-chan []string,counter chan<- int, wg *sync.WaitGroup) {
defer wg.Done()
for record := range link {
// process csv record
counter <- 1
}
}
func progressStats(counter <-chan int) {
var feedbackThreshold = progressPercentageStep
for count := range counter {
totalRemaining := atomic.AddInt32(&totalRecordsToProcess, -count)
donePercent := 100.0 * processed / totalRemaining
// log progress
if donePercent >= feedbackThreshold {
log.Printf("Progress ************** Total Records: %d, Processed Records : %d, Processed Percentage: %.2f **************\n", totalRecordsToProcess, processed, donePercent)
feedbackThreshold += progressPercentageStep
}
}
}
I'm trying to fetch the content of an API with numerous goroutines.
I'm using a for loop to iterate over different character, but it seems like the forloop reaches its final value, before the requests are sent off.
package main
import (
"encoding/json"
"fmt"
"net/http"
"sync"
)
type people struct {
Name string `json:"name"`
}
func main(){
names := make(chan string, 25)
var wg sync.WaitGroup
for i := 0; i < 25; i++ {
wg.Add(1)
go func() {
defer wg.Done()
var p people
url := fmt.Sprintf("https://swapi.dev/api/people/%d", i)
getJSON(url, &p)
names <- p.Name
}()
}
name := <-names
fmt.Println(name)
wg.Wait()
}
func getJSON(url string, target interface{}) error {
r, err := http.Get(url)
if err != nil {
return err
}
defer r.Body.Close()
json.NewDecoder(r.Body).Decode(target)
return nil
}
Also, if somebody could improve my code quality, I'd be very grateful, I'm very new to Golang and don't have anybody to learn from!
You go routines are all using the same variable i. So on the first loop, you launch a goroutine that makes a url from i, and on the next loop i is incremented before that routine has a chance to run.
It's a common mistake in GoLang. The solution is to make a variable for each loop, and pass that one forward. You can either do it with a closure like this (playground).
for i := 0; i < 25; i++ {
wg.Add(1)
localI := i
go func() {
defer wg.Done()
var p people
// Use LocalI here
url := fmt.Sprintf("https://swapi.dev/api/people/%d", localI)
getJSON(url, &p)
names <- p.Name
}()
}
Or as an argument to the function (playground)
for i := 0; i < 25; i++ {
wg.Add(1)
localI := i
go func(localI int) {
defer wg.Done()
var p people
// Use LocalI here
url := fmt.Sprintf("https://swapi.dev/api/people/%d", localI)
getJSON(url, &p)
names <- p.Name
// Pass i here. Since I is a primitive, it is passed by value, not reference.
// Meaning a copy is made.
}(i)
}
Here is a good writeup on the mistake you made:
https://github.com/golang/go/wiki/CommonMistakes#using-goroutines-on-loop-iterator-variables
And the one above it is good to read too!
I've been experimenting with goroutines and channels, and I wanted to test the WaitGroup feature. Here I'm trying to execute an HTTP flood job, where the parent thread spawns a lot of goroutines which will make infinite requests, unless receiving a stop message:
func (hf *HTTPFlood) Run() {
childrenStop := make(chan int, hf.ConcurrentCalls)
stop := false
totalRequests := 0
requestsChan := make(chan int)
totalErrors := 0
errorsChan := make(chan int)
var wg sync.WaitGroup
for i := 0; i < hf.ConcurrentCalls; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for {
select {
case <-childrenStop:
fmt.Printf("stop child\n")
return
default:
_, err := Request(hf.Victim.String())
requestsChan <- 1
if err != nil {
errorsChan <- 1
}
}
}
}()
}
timeout := time.NewTimer(time.Duration(MaximumJobTime) * time.Second)
for !stop {
select {
case req := <- requestsChan:
totalReq += req
case err := <- errorsChan:
totalErrors += err
case <- timeout.C:
fmt.Printf("%s timed up\n", hf.Victim.String())
for i := 0; i < hf.ConcurrentCalls; i++ {
childrenStop <- 1
}
close(childrenStop)
stop = true
break
}
}
fmt.Printf("waiting\n")
wg.Wait()
fmt.Printf("after wait\n")
close(requestsChan)
close(errorsChan)
fmt.Printf("end\n")
}
Once timeout is fired, the parent thread successfully exits the loop and reaches the Wait instruction, but even though the stopChildren channel is filled, the child goroutines seem to never receive messages on the stopChildren channel.
What am I missing?
EDIT:
So the issue obviously was how the channels and its sends/receives were managed.
First of all the childrenStop channel was closed before all childs had received the message. The channel should be closed after the Wait
On the other hand, since no reads were done neither on requestsChan nor errorsChan once the parent thread sends the stop signal, most of the childs stayed blocked sending on these two channels. I tried to keep reading in the parent thread, outside the loop just before the Wait but that didn't work so I switched the implementation to Atomic counters which seem to be a more suitable way to manage this specific use case.
func (hf *HTTPFlood) Run() {
childrenStop := make(chan int, hf.ConcurrentCalls)
var totalReq uint64
var totalErrors uint64
var wg sync.WaitGroup
for i := 0; i < hf.ConcurrentCalls; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for {
select {
case <-childrenStop:
fmt.Printf("stop child\n")
return
default:
_, err := Request(hf.Victim.String())
atomic.AddUint64(&totalReq, 1)
if err != nil {
atomic.AddUint64(&totalErrors, 1)
}
}
}
}()
}
timeout := time.NewTimer(time.Duration(MaximumJobTime) * time.Second)
<- timeout.C
fmt.Printf("%s timed up\n", hf.Victim.String())
for i := 0; i < hf.ConcurrentCalls; i++ {
childrenStop <- 1
}
fmt.Printf("waiting\n")
wg.Wait()
fmt.Printf("after wait\n")
close(childrenStop)
fmt.Printf("end\n")
}
Your go routines can be is blocked at requestsChan <- 1.
case <- timeout.C:
fmt.Printf("%s timed up\n", hf.Victim.String())
for i := 0; i < hf.ConcurrentCalls; i++ {
childrenStop <- 1
}
close(childrenStop)
stop = true
break
Here you are sending a number to childrenStop and expect the go routines to receive it. But while you are sending the childrenStop signal, your routines could have sent something on requestsChan. But as you break from the loop after sending the close signals, there's no one listening on requestsChan to receive.
You can confirm this by printing something just before and after requestsChan <- 1 to confirm the behaviour.
A channel will block when you send something on it while no one is receiving on the other end
Here is a possible modification.
package main
import (
"fmt"
"time"
)
func main() {
requestsChan := make(chan int)
done := make(chan chan bool)
for i := 0; i < 5; i++ {
go func(it int) {
for {
select {
case c := <-done:
c <- true
return
default:
requestsChan <- it
}
}
}(i)
}
max := time.NewTimer(1 * time.Millisecond)
allChildrenDone := make(chan bool)
childrenDone := 0
childDone := make(chan bool)
go func() {
for {
select {
case i := <-requestsChan:
fmt.Printf("received %d;", i)
case <-max.C:
fmt.Println("\nTimeup")
for i := 0; i < 5; i++ {
go func() {
done <- childDone
fmt.Println("sent done")
}()
}
case <-childDone:
childrenDone++
fmt.Println("child done ", childrenDone)
if childrenDone == 5 {
allChildrenDone <- true
return
}
}
}
}()
fmt.Println("Waiting")
<-allChildrenDone
}
Thing to note here is that am sending the close signal in go routines so that the loop can continue while i wait for all the children to have cleanly exit.
Please watch this talk by Rob Pike which covers these details clearly.
[Edit]: The previous code would have resulted in a running routine after exiting.
I'm currently staring at a beefed up version of the following code:
func embarrassing(data []string) []string {
resultChan := make(chan string)
var waitGroup sync.WaitGroup
for _, item := range data {
waitGroup.Add(1)
go func(item string) {
defer waitGroup.Done()
resultChan <- doWork(item)
}(item)
}
go func() {
waitGroup.Wait()
close(resultChan)
}()
var results []string
for result := range resultChan {
results = append(results, result)
}
return results
}
This is just blowing my mind. All this is doing can be expressed in other languages as
results = parallelMap(data, doWork)
Even if it can't be done quite this easily in Go, isn't there still a better way than the above?
If you need all the results, you don't need the channel (and the extra goroutine to close it) to communicate the results, you can write directly into the results slice:
func cleaner(data []string) []string {
results := make([]string, len(data))
wg := &sync.WaitGroup{}
wg.Add(len(data))
for i, item := range data {
go func(i int, item string) {
defer wg.Done()
results[i] = doWork(item)
}(i, item)
}
wg.Wait()
return results
}
This is possible because slice elements act as distinct variables, and thus can be written individually without synchronization. For details, see Can I concurrently write different slice elements. You also get the results in the same order as your input for free.
Anoter variation: if doWork() would not return the result but get the address where the result should be "placed", and additionally the sync.WaitGroup to signal completion, that doWork() function could be executed "directly" as a new goroutine.
We can create a reusable wrapper for doWork():
func doWork2(item string, result *string, wg *sync.WaitGroup) {
defer wg.Done()
*result = doWork(item)
}
If you have the processing logic in such format, this is how it can be executed concurrently:
func cleanest(data []string) []string {
results := make([]string, len(data))
wg := &sync.WaitGroup{}
wg.Add(len(data))
for i, item := range data {
go doWork2(item, &results[i], wg)
}
wg.Wait()
return results
}
Yet another variation could be to pass a channel to doWork() on which it is supposed to deliver the result. This solution doesn't even require a sync.Waitgroup, as we know how many elements we want to receive from the channel:
func cleanest2(data []string) []string {
ch := make(chan string)
for _, item := range data {
go doWork3(item, ch)
}
results := make([]string, len(data))
for i := range results {
results[i] = <-ch
}
return results
}
func doWork3(item string, res chan<- string) {
res <- "done:" + item
}
"Weakness" of this last solution is that it may collect the result "out-of-order" (which may or may not be a problem). This approach can be improved to retain order by letting doWork() receive and return the index of the item. For details and examples, see How to collect values from N goroutines executed in a specific order?
You can also use reflection to achieve something similar.
In this example it distribute the handler function over 4 goroutines and returns the results in a new instance of the given source slice type.
package main
import (
"fmt"
"reflect"
"strings"
"sync"
)
func parralelMap(some interface{}, handle interface{}) interface{} {
rSlice := reflect.ValueOf(some)
rFn := reflect.ValueOf(handle)
dChan := make(chan reflect.Value, 4)
rChan := make(chan []reflect.Value, 4)
var waitGroup sync.WaitGroup
for i := 0; i < 4; i++ {
waitGroup.Add(1)
go func() {
defer waitGroup.Done()
for v := range dChan {
rChan <- rFn.Call([]reflect.Value{v})
}
}()
}
nSlice := reflect.MakeSlice(rSlice.Type(), rSlice.Len(), rSlice.Cap())
for i := 0; i < rSlice.Len(); i++ {
dChan <- rSlice.Index(i)
}
close(dChan)
go func() {
waitGroup.Wait()
close(rChan)
}()
i := 0
for v := range rChan {
nSlice.Index(i).Set(v[0])
i++
}
return nSlice.Interface()
}
func main() {
fmt.Println(
parralelMap([]string{"what", "ever"}, strings.ToUpper),
)
}
Test here https://play.golang.org/p/iUPHqswx8iS
I have 5 huge (4 million rows each) logfiles that I process in Perl currently and I thought I may try to implement the same in Go and its concurrent features. So, being very inexperienced in Go, I was thinking of doing as below. Any comments on the approach will be greatly appreciated.
Some rough pseudocode:
var wg1 sync.WaitGroup
var wg2 sync.WaitGroup
func processRow (r Row) {
wg2.Add(1)
defer wg2.Done()
res = <process r>
return res
}
func processFile(f File) {
wg1.Add(1)
open(newfile File)
defer wg1.Done()
line = <row from f>
result = go processRow(line)
newFile.Println(result) // Write new processed line to newFile
wg2.Wait()
newFile.Close()
}
func main() {
for each f logfile {
go processFile(f)
}
wg1.Wait()
}
So, idea is that I process these 5 files concurrently and then all rows of each file will in turn also be processed concurrently.
Will that work?
You should definitely use channels to manage your processed rows. Alternatively you could also write another goroutine to handle your output.
var numGoWriters = 10
func processRow(r Row, ch chan<- string) {
res := process(r)
ch <- res
}
func writeRow(f File, ch <-chan string) {
w := bufio.NewWriter(f)
for s := range ch {
_, err := w.WriteString(s + "\n")
}
func processFile(f File) {
outFile, err := os.Create("/path/to/file.out")
if err != nil {
// handle it
}
defer outFile.Close()
var wg sync.WaitGroup
ch := make(chan string, 10) // play with this number for performance
defer close(ch) // once we're done processing rows, we close the channel
// so our worker threads exit
fScanner := bufio.NewScanner(f)
for fScanner.Scan() {
wg.Add(1)
go func() {
processRow(fScanner.Text(), ch)
wg.Done()
}()
}
for i := 0; i < numGoWriters; i++ {
go writeRow(outFile, ch)
}
wg.Wait()
}
Here we have processRow doing all the processing (I assumed to string), writeRow doing all the out I/O, and processFile tying each file together. Then all main has to do is hand off the files, spawn the goroutines, et voila.
func main() {
var wg sync.WaitGroup
filenames := [...]string{"here", "are", "some", "log", "paths"}
for fname := range filenames {
inFile, err := os.Open(fname)
if err != nil {
// handle it
}
defer inFile.Close()
wg.Add(1)
go processFile(inFile)
}
wg.Wait()