Go routines started with for-loop - one or many channels? - go

I would like to load some JSON files (".json") using a goroutine called from a for-loop. I'd like to have the loading parallelized (processing the first files while the other files are still being loaded).
Q1. Since the number of files may vary (new ones may be added), I would use a list of file names (the names are only auto-generated in this example), and therefore I'd like to use a for-loop. Is that optimal?
Q2. What would be the most effective use of channel(s).
Q3. How would I define the channel(s) if a unique channel for each load operation (as in the example code below) is needed?
Example code (to be compacted & capable of loading the files using a list of file names):
// load_json is a placeholder loader for "filename" + s + ".json". Once the
// (elided) load is finished it confirms completion by sending a single zero
// byte on aChan.
func load_json(aChan chan byte, s string) {
	// load "filename" + s + ".json"
	const confirmation byte = 0
	// confirm to the channel
	aChan <- confirmation
}
// do_stuff is an empty placeholder for the work to be done with a newly
// loaded JSON file.
func do_stuff() {
// .. with the newly loaded json
}
// Main starts four load_json goroutines, each with its own unbuffered
// channel, and then waits for the confirmations in the fixed order A, B, C, D.
func Main() {
// One dedicated channel per load operation.
chan_A := make(chan byte)
go load_json(chan_A, "_classA")
chan_B := make(chan byte)
go load_json(chan_B, "_classB")
chan_C := make(chan byte)
go load_json(chan_C, "_classC")
chan_D := make(chan byte)
go load_json(chan_D, "_classD")
// Each receive blocks until the corresponding loader has confirmed.
<-chan_A
// Now, do stuff with Class A
<-chan_B
// etc...
<-chan_C
<-chan_D
fmt.Println("Done.")
}
EDIT:
I designed a simplified test solution based on the ideas suggested by "Tom" (see below). In my case I split the task into three phases, using one channel per phase to control the execution. However, I tend to get deadlocks with this code (see the execution results and the note below the code).
Run this code on the PlayGround.
How can I avoid the deadlocks in this code?:
// TJsonFileInfo is the work item passed between the pipeline phases; it
// carries the name of the JSON file being handled.
type TJsonFileInfo struct {
FileName string
}
// TChannelTracer holds one counter per pipeline phase.
type TChannelTracer struct { // Will count & display visited phases A, B, C
A, B, C int
}
// ChannelTracer is the package-level tracer incremented by the phase functions.
var ChannelTracer TChannelTracer
// jsonFileList is the list of JSON file names fed into the pipeline.
var jsonFileList = []string{
"./files/classA.json",
"./files/classB.json",
"./files/classC.json",
}
// LoadJsonFiles is phase A: it wraps aFileName in a TJsonFileInfo (the actual
// file read is commented out), bumps the phase-A counter, logs the file name
// and sends the item on aResultQueueChan.
func LoadJsonFiles(aFileName string, aResultQueueChan chan *TJsonFileInfo) {
	loaded := &TJsonFileInfo{FileName: aFileName}
	// file, e := ioutil.ReadFile(loaded.FileName)...
	ChannelTracer.A += 1
	fmt.Printf("A. Loaded file: %s\n", loaded.FileName)
	aResultQueueChan <- loaded
}
// UnmarshalFile is phase B: it takes exactly one item from aWorkQueueChan,
// bumps the phase-B counter, logs the file name and forwards the item on
// aResultQueueChan.
func UnmarshalFile(aWorkQueueChan chan *TJsonFileInfo, aResultQueueChan chan *TJsonFileInfo) {
	item := <-aWorkQueueChan
	ChannelTracer.B += 1
	name := item.FileName
	fmt.Printf("B. Marshalled file: %s\n", name)
	aResultQueueChan <- item
}
// ProcessWork is phase C: it takes exactly one item from aWorkQueueChan,
// bumps the phase-C counter, logs the file name and hands the item to
// aDoneQueueChan.
func ProcessWork(aWorkQueueChan chan *TJsonFileInfo, aDoneQueueChan chan *TJsonFileInfo) {
	item := <-aWorkQueueChan
	ChannelTracer.C += 1
	name := item.FileName
	fmt.Printf("C. Processed file: %s \n", name)
	aDoneQueueChan <- item
}
// main wires three unbuffered channels into a three-phase pipeline and starts
// one goroutine per phase for every file, then loops forever receiving from
// all three channels.
func main() {
marshalChan := make(chan *TJsonFileInfo)
processChan := make(chan *TJsonFileInfo)
doneProcessingChan := make(chan *TJsonFileInfo)
for _, fileName := range jsonFileList {
go LoadJsonFiles(fileName, marshalChan)
go UnmarshalFile(marshalChan, processChan)
go ProcessWork(processChan, doneProcessingChan)
}
// NOTE(review): this select also receives from marshalChan and processChan,
// so main competes with the phase B and C goroutines for items; anything
// main receives from those two channels is dropped instead of being passed
// on to the next phase. The loop has no exit condition.
for {
select {
case result := <-marshalChan:
result.FileName = result.FileName // dummy use
case result := <-processChan:
result.FileName = result.FileName // dummy use
case result := <-doneProcessingChan:
result.FileName = result.FileName // dummy use
fmt.Printf("Done%s Channels visited: %v\n", ".", ChannelTracer)
}
}
}
/**
RESULTS (for phases A, B and C):
A. Loaded file: ./files/classA.json
A. Loaded file: ./files/classB.json
A. Loaded file: ./files/classC.json
B. Marshalled file: ./files/classB.json
B. Marshalled file: ./files/classC.json
C. Processed file: ./files/classB.json
C. Processed file: ./files/classC.json
Done. Channels visited: {3 2 2} // ChannelTracer for phase A, B and C
Done. Channels visited: {3 2 2}
fatal error: all goroutines are asleep - deadlock!
*/
Note that this code doesn't access the file system so it should run on the PlayGround.
EDIT2: - Apart from the unsafe "ChannelTracer" I can avoid deadlocks only by consuming doneProcessingChannel the same number of times as the file tasks.
Run the code here: Playground
// main runs the three-phase pipeline with one long-lived goroutine for the
// unmarshal phase and one for the process phase, one loader goroutine per
// file, and then waits for exactly one completion per file.
func main() {
	var (
		marshalChan        = make(chan *TJsonFileInfo)
		processChan        = make(chan *TJsonFileInfo)
		doneProcessingChan = make(chan *TJsonFileInfo)
	)
	go UnmarshalFiles(marshalChan, processChan)
	go ProcessWork(processChan, doneProcessingChan)
	for idx := range jsonFileList {
		go LoadJsonFiles(jsonFileList[idx], marshalChan)
	}
	// Read doneProcessingChan equal number of times
	// as the spawned tasks (files) above :
	for range jsonFileList {
		<-doneProcessingChan
		fmt.Printf("Done%s Channels visited: %v\n", ".", ChannelTracer)
	}
}
// RIL

building on the answer by #BraveNewCurrency I have composed a simplistic example program for you:
package main
import (
"encoding/json"
"fmt"
"os"
)
// Result is the value each JSON file is decoded into.
type Result struct {
Some string
Another string
AndAn int
}
// generateWork opens each of the hard-coded JSON files in order and sends the
// open file handle on work. It panics if a file cannot be opened.
func generateWork(work chan *os.File) {
	for _, path := range []string{
		"/home/foo/a.json",
		"/home/foo/b.json",
		"/home/foo/c.json",
	} {
		handle, openErr := os.Open(path)
		if openErr != nil {
			panic(openErr)
		}
		work <- handle
	}
}
// processWork takes a single open file from work, decodes its JSON content
// into a Result and sends that Result on done.
//
// The file is closed as soon as decoding is finished so the descriptor is not
// leaked. A decode failure is reported on stderr; the (possibly partially
// filled) Result is still sent so that the receiver gets one value per file.
func processWork(work chan *os.File, done chan Result) {
	file := <-work
	var result Result
	if err := json.NewDecoder(file).Decode(&result); err != nil {
		fmt.Fprintf(os.Stderr, "decoding %s: %v\n", file.Name(), err)
	}
	// The file was only read from, so a Close error carries no information
	// worth acting on here.
	_ = file.Close()
	done <- result
}
func main() {
work := make(chan *os.File)
go generateWork(work)
done := make(chan Result)
for i := 0; i < 100; i++ {
go processWork(work, done)
}
for {
select {
case result := <-done:
// a result is available
fmt.Println(result)
}
}
}
Note that this program won't work on the playground because file-system access is disallowed there.
Edit:
To answer the edition in your question, I've taken the code and changed some small things:
package main
import (
_ "encoding/json"
"fmt"
_ "io/ioutil"
_ "os"
)
// TJsonMetaInfo holds a single MetaSystem string; it is not referenced by the
// other code in this listing.
type TJsonMetaInfo struct {
MetaSystem string
}
// TJsonFileInfo is the work item passed between the pipeline phases; it
// carries the name of the JSON file being handled.
type TJsonFileInfo struct {
FileName string
}
// TChannelTracer holds one counter per pipeline phase.
type TChannelTracer struct { // Will count & display visited phases A, B, C
A, B, C int
}
// ChannelTracer is the package-level tracer incremented by the phase functions.
var ChannelTracer TChannelTracer
// jsonFileList is the list of JSON file names fed into the pipeline.
var jsonFileList = []string{
"./files/classA.json",
"./files/classB.json",
"./files/classC.json",
}
// LoadJsonFiles is phase A: it wraps aFileName in a TJsonFileInfo (the actual
// file read is commented out), bumps the phase-A counter, logs the file name
// and sends a pointer to the item on aResultQueueChan.
func LoadJsonFiles(aFileName string, aResultQueueChan chan *TJsonFileInfo) {
	loaded := &TJsonFileInfo{FileName: aFileName}
	// file, e := ioutil.ReadFile(loaded.FileName)
	// etc...
	ChannelTracer.A += 1
	fmt.Printf("A. Loaded file: %s\n", loaded.FileName)
	aResultQueueChan <- loaded
}
// UnmarshalFiles is the long-running phase B worker: it repeatedly takes an
// item from aWorkQueueChan, bumps the phase-B counter, logs the file name and
// forwards the item on aResultQueueChan. It never returns.
func UnmarshalFiles(aWorkQueueChan chan *TJsonFileInfo, aResultQueueChan chan *TJsonFileInfo) {
	for {
		item := <-aWorkQueueChan
		ChannelTracer.B += 1
		name := item.FileName
		fmt.Printf("B. Unmarshalled file: %s\n", name)
		aResultQueueChan <- item
	}
}
// ProcessWork is the long-running phase C worker: it repeatedly takes an item
// from aWorkQueueChan, bumps the phase-C counter, logs the file name and
// hands the item to aDoneQueueChan. It never returns.
func ProcessWork(aWorkQueueChan chan *TJsonFileInfo, aDoneQueueChan chan *TJsonFileInfo) {
	for {
		item := <-aWorkQueueChan
		ChannelTracer.C += 1
		name := item.FileName
		fmt.Printf("C. Processed file: %s \n", name)
		aDoneQueueChan <- item
	}
}
// main runs the pipeline with one long-lived goroutine per later phase and
// one loader goroutine per file, then reports every item that reaches the
// done channel. The final receive loop has no exit condition.
func main() {
	var (
		marshalChan        = make(chan *TJsonFileInfo)
		processChan        = make(chan *TJsonFileInfo)
		doneProcessingChan = make(chan *TJsonFileInfo)
	)
	go UnmarshalFiles(marshalChan, processChan)
	go ProcessWork(processChan, doneProcessingChan)
	for idx := range jsonFileList {
		go LoadJsonFiles(jsonFileList[idx], marshalChan)
	}
	for {
		result := <-doneProcessingChan
		result.FileName = result.FileName // dummy use
		fmt.Printf("Done%s Channels visited: %v\n", ".", ChannelTracer)
	}
}
Note that this code still deadlocks but at the end, when all work is complete, in the last empty for loop in main().
Note also that these lines:
ChannelTracer.A += 1
ChannelTracer.B += 1
ChannelTracer.C += 1
are not concurrency-safe. This means that in a multi-threaded environment one goroutine and the other might try to increment the same counter at the same time, resulting in a wrong count. To come around this issue, take a look at the following packages:
http://golang.org/pkg/sync/
http://golang.org/pkg/sync/atomic/

You should structure your program this way:
1) the main routine creates a channel for "work to do" and probably one for "done work" (both channels should probably have some buffering)
2) spin off one goroutine to generate the file list and put them in the "work to do" channel.
3) spin up N goroutines (in a for loop) to process files. The routine will read the file from the "work to do" channel, process it, and send the response to the "done work" channel.
4) the main routine waits on "done work" and prints them or whatever.
The optimal "N" above varies depending on the problem
- If your work is CPU bound, the optimal N should be about the number of processors in your system.
- If your work is disk bound, performance may actually go down as you increase N because multiple workers will cause more random I/O.
- If your work pulls files from many remote computers (think webcrawling), then the optimal N might be very high (100 or even 1000).

Related

How to return value from aggregate function over a chan [duplicate]

This question already has answers here:
mixture of field:value and value initializers
(2 answers)
how to provide a value for an imported embedded struct literal?
(1 answer)
Closed 6 days ago.
I have aggregate function. I m sending data to this function through a channel. Once I process the data, I have to send back updated information to each original caller. Aggregation help us to improve latency.
I m trying to send a struct of channel & int over a channel. Aggregate function will send result back via channel inside struct, to the original caller. This is what I have tried, (Playground link)
package main
import (
"context"
"fmt"
"time"
)
// Original at https://elliotchance.medium.com/batch-a-channel-by-size-or-time-in-go-92fa3098f65
// This works.
//
// BatchStringsCtx groups the items read from values into slices and delivers
// them on the returned channel. A batch is emitted when it holds maxItems
// items, when maxTimeout has elapsed since the batch was started, or when the
// input ends; empty batches are never sent. The returned channel is closed
// once values is closed or ctx is done.
func BatchStringsCtx[T any](ctx context.Context, values <-chan T, maxItems int, maxTimeout time.Duration) chan []T {
	batches := make(chan []T)
	go func() {
		defer close(batches)
		for running := true; running; {
			var pending []T
			deadline := time.After(maxTimeout)
		collect:
			for {
				select {
				case <-ctx.Done():
					running = false
					break collect
				case item, open := <-values:
					if !open {
						running = false
						break collect
					}
					pending = append(pending, item)
					if len(pending) == maxItems {
						break collect
					}
				case <-deadline:
					break collect
				}
			}
			// Flush whatever was collected, but never send an empty batch.
			if len(pending) > 0 {
				batches <- pending
			}
		}
	}()
	return batches
}
// ER is a request sent to process: e is the input value and r is the channel
// on which process sends the result back to the caller.
type ER struct{
e int
r chan int
}
// process batches the incoming requests (at most 2 per batch, 10ms timeout)
// and answers every request in a batch on its own reply channel with e + 100.
// It returns when the batch channel is closed.
func process(strings chan ER) {
	batches := BatchStringsCtx[ER](context.Background(), strings, 2, 10*time.Millisecond)
	for batch := range batches {
		for i := range batch { // 2 elem in batch
			req := batch[i]
			req.r <- req.e + 100 // some operation. Batching helps in improving latency.
		}
	}
}
// main starts process, sends two requests through the strings channel and
// prints the two replies.
func main() {
strings := make(chan ER)
go process(strings)
// NOTE(review): both literals mix a keyed field (e:) with a positional
// value, and they create a chan chan int although ER.r is declared as
// chan int.
er := ER{ e:0, make(chan chan int)}
er1 := ER{ e:1, make(chan chan int)}
go func() {
strings <- er
strings <- er1
close(strings)
}()
fmt.Println(<-er.r, <-er1.r) // print 100, 101
}
But I get these errors,
./prog.go:71:17: mixture of field:value and value elements in struct literal
./prog.go:72:18: mixture of field:value and value elements in struct literal
Any idea, what can be improved?
Do below changes to your code snippet.
er := ER{ e:0, make(chan chan int)}
er1 := ER{ e:1, make(chan chan int)}
Above two lines should be as below,
er := ER{e: 0, r: make(chan int)}
er1 := ER{e: 1, r: make(chan int)}

Program goes into deadlock using waitgroup

I'm writing a program that reads a list of order numbers in a file called orders.csv and compares it with the other csv files that are present in the folder.
The problem is that it goes into deadlock even using waitgroup and I don't know why.
For some reason stackoverflow says that my post is mostly code, so I have to add this line, because the whole code is necessary if someone wants to help me debug this problem I'm having.
package main
import (
"bufio"
"fmt"
"log"
"os"
"path/filepath"
"strings"
"sync"
)
// Files holds the names of the CSV files to be checked.
type Files struct {
filenames []string
}
// Orders holds the order IDs read from the orders file.
type Orders struct {
ID []string
}
// ordersFilename is the file the order IDs are read from; it is excluded from
// the list of CSV files to check.
var ordersFilename string = "orders.csv"
// main reads the order IDs, starts one checkFile goroutine per CSV file and
// tries to coordinate completion with a WaitGroup, a mutex-protected counter
// and a done channel at the same time.
func main() {
var (
ordersFile *os.File
files Files
orders Orders
err error
)
mu := new(sync.Mutex)
wg := &sync.WaitGroup{}
// NOTE(review): the only wg.Done matching this Add is in the "done" case of
// the select further down.
wg.Add(1)
if ordersFile, err = os.Open(ordersFilename); err != nil {
log.Fatalln("Could not open file: " + ordersFilename)
}
orders = getOrderIDs(ordersFile)
files.filenames = getCSVsFromCurrentDir()
var filenamesSize = len(files.filenames)
var ch = make(chan map[string][]string, filenamesSize)
var done = make(chan bool)
for i, filename := range files.filenames {
go func(currentFilename string, ch chan<- map[string][]string, i int, orders Orders, wg *sync.WaitGroup, filenamesSize *int, mu *sync.Mutex, done chan<- bool) {
// NOTE(review): wg.Add is called from inside the goroutine it accounts for.
wg.Add(1)
defer wg.Done()
checkFile(currentFilename, orders, ch)
mu.Lock()
*filenamesSize--
mu.Unlock()
// NOTE(review): *filenamesSize is read here without holding mu.
if i == *filenamesSize {
done <- true
close(done)
}
}(filename, ch, i, orders, wg, &filenamesSize, mu, done)
}
// This select is not inside a loop, so it handles exactly one event: either
// a single result from ch or the done signal.
select {
case str := <-ch:
fmt.Printf("%+v\n", str)
case <-done:
wg.Done()
break
}
wg.Wait()
close(ch)
}
// getCSVsFromCurrentDir walks the current directory tree and returns the
// paths of all entries ending in ".csv", excluding "." itself and the orders
// file. Errors reported to the walk callback are ignored by the callback.
func getCSVsFromCurrentDir() []string {
	var found []string
	visit := func(path string, info os.FileInfo, err error) error {
		if path == "." || path == ordersFilename {
			return nil
		}
		if strings.HasSuffix(path, ".csv") {
			found = append(found, path)
		}
		return nil
	}
	if walkErr := filepath.Walk(".", visit); walkErr != nil {
		log.Fatalln("Could not read file names in current dir")
	}
	return found
}
// getOrderIDs reads file line by line and returns an Orders value whose ID
// slice holds every line read before the first read error. The program is
// terminated if not even the first line can be read.
func getOrderIDs(file *os.File) Orders {
	var collected Orders
	reader := bufio.NewReader(file)
	line, err := readLine(reader)
	if err != nil {
		log.Fatalln("Could not read file: " + ordersFilename)
	}
	for ; err == nil; line, err = readLine(reader) {
		collected.ID = append(collected.ID, line)
	}
	return collected
}
// checkFile scans filename line by line and collects, without duplicates,
// every line that is also present in orders.ID. The result is a map with the
// single key filename (absent when nothing matched) and is sent on ch.
//
// The program is terminated if the file cannot be opened or its first line
// cannot be read. The file is closed before the result is sent so that a
// blocked send does not keep the descriptor open.
func checkFile(filename string, orders Orders, ch chan<- map[string][]string) {
	file, err := os.Open(filename)
	if err != nil {
		log.Fatalln("Could not read file: " + filename)
	}
	reader := bufio.NewReader(file)
	line, err := readLine(reader)
	if err != nil {
		log.Fatalln("Could not read file: " + filename)
	}
	orderFilesMap := make(map[string][]string)
	// Reading stops at the first error, which includes the regular end of file.
	for ; err == nil; line, err = readLine(reader) {
		if containedInSlice(line, orders.ID) && !containedInSlice(line, orderFilesMap[filename]) {
			orderFilesMap[filename] = append(orderFilesMap[filename], line)
		}
	}
	// The file was only read from, so a Close error carries no information
	// worth acting on here.
	_ = file.Close()
	ch <- orderFilesMap
}
// containedInSlice reports whether str is equal to at least one element of
// slice. A nil or empty slice contains nothing.
func containedInSlice(str string, slice []string) bool {
	for idx := 0; idx < len(slice); idx++ {
		if slice[idx] == str {
			return true
		}
	}
	return false
}
// readLine reads one complete line from r, joining the fragments that
// bufio.Reader.ReadLine returns for lines longer than its buffer. It returns
// the line without its line terminator together with the error of the last
// read, if any.
func readLine(r *bufio.Reader) (string, error) {
	var joined []byte
	for {
		fragment, more, err := r.ReadLine()
		joined = append(joined, fragment...)
		if err != nil || !more {
			return string(joined), err
		}
	}
}
The first issue is that wg.Add must always be called outside of the goroutine(s) it stands for. If it isn't, the
wg.Wait call might run before the goroutine(s) have actually started running (and called wg.Add) and will therefore "think"
that there is nothing to wait for.
The second issue with the code is that there are multiple ways it waits for the routines to be done. There is
the WaitGroup and there is the done channel. Use only one of them. Which one depends also on how the results of the
goroutines are used. Here we come to the next problem.
The third issue is with gathering the results. Currently the code only prints / uses a single result from the goroutines.
Put a for { ... } loop around the select and use return to break out of the loop if the done channel is closed.
(Note that you don't need to send anything on the done channel, closing it is enough.)
Improved Version 0.0.1
So here the first version (including some other "code cleanup") with a done channel used for closing and the WaitGroup removed:
// main reads the order IDs, checks every CSV file in its own goroutine and
// prints each result. Completion is signalled by closing the done channel;
// no WaitGroup is used.
//
// A mutex-protected counter tracks how many goroutines are still running, and
// whichever goroutine finishes last closes done, independent of the order in
// which the goroutines are scheduled. Because ch is buffered with one slot
// per file, every result has already been queued when done is closed, so the
// remaining results are drained before returning.
func main() {
	ordersFile, err := os.Open(ordersFilename)
	if err != nil {
		log.Fatalln("Could not open file: " + ordersFilename)
	}
	orders := getOrderIDs(ordersFile)
	// The file was only read from, so a Close error carries no information
	// worth acting on here.
	_ = ordersFile.Close()
	files := Files{
		filenames: getCSVsFromCurrentDir(),
	}
	if len(files.filenames) == 0 {
		// No goroutine would ever close done; there is nothing to wait for.
		return
	}
	var (
		mu        sync.Mutex
		remaining = len(files.filenames)
		ch        = make(chan map[string][]string, remaining)
		// Closing done is the completion signal; no value is ever sent.
		done = make(chan struct{})
	)
	for _, filename := range files.filenames {
		go func(currentFilename string) {
			checkFile(currentFilename, orders, ch)
			// Decrement and test under the same lock so exactly one
			// goroutine observes the counter reaching zero.
			mu.Lock()
			remaining--
			last := remaining == 0
			mu.Unlock()
			if last {
				close(done)
			}
		}(filename)
	}
	for {
		select {
		case str := <-ch:
			fmt.Printf("%+v\n", str)
		case <-done:
			// select picks randomly among ready cases, so results may
			// still be queued in ch: print them before returning.
			for {
				select {
				case str := <-ch:
					fmt.Printf("%+v\n", str)
				default:
					return
				}
			}
		}
	}
}
Improved Version 0.0.2
In your case we have some advantage however. We know exactly how many goroutines we started and therefore also how
many results we expect. (Of course if each goroutine returns a result which currently this code does.) That gives
us another option as we can collect the results with another for loop having the same amount of iterations:
// main reads the order IDs, checks every CSV file in its own goroutine and
// prints exactly one result per started goroutine, which makes any extra
// completion signalling unnecessary.
func main() {
	ordersFile, err := os.Open(ordersFilename)
	if err != nil {
		log.Fatalln("Could not open file: " + ordersFilename)
	}
	orders := getOrderIDs(ordersFile)
	files := Files{filenames: getCSVsFromCurrentDir()}
	// Note: a buffered channel helps speed things up. The size does not need to match the size of the items that will
	// be passed through the channel. A fixed, small size is perfect here.
	results := make(chan map[string][]string, 5)
	for idx := range files.filenames {
		// The arguments of a go statement are evaluated immediately, so each
		// goroutine receives its own file name.
		go checkFile(files.filenames[idx], orders, results)
	}
	for pending := len(files.filenames); pending > 0; pending-- {
		fmt.Printf("%+v\n", <-results)
	}
}
A lot simpler, isn't it? Hope that helps!
There is a lot wrong with this code.
You're using the WaitGroup wrong. Add has to be called in the main goroutine, else there is a chance that Wait is called before all Add calls complete.
There's an extraneous Add(1) call right after initializing the WaitGroup that isn't matched by a Done() call, so Wait will never return (assuming the point above is fixed).
You're using both a WaitGroup and a done channel to signal completion. This is redundant at best.
You're reading filenamesSize while not holding the lock (in the if i == *filenamesSize statement). This is a race condition.
The i == *filenamesSize condition makes no sense in the first place. Goroutines execute in an arbitrary order, so you can't be sure that the goroutine with i == 0 is the last one to decrement filenamesSize
This can all be simplified by getting rid of most of the synchronization primitives and simply closing the ch channel when all goroutines are done:
func main() {
ch := make(chan map[string][]string)
var wg WaitGroup
for _, filename := range getCSVsFromCurrentDir() {
filename := filename // capture loop var
wg.Add(1)
go func() {
checkFile(filename, orders, ch)
wg.Done()
}()
}
go func() {
wg.Wait() // after all goroutines are done...
close(ch) // let range loop below exit
}()
for str := range ch {
// ...
}
}
Not an answer, but some comments that do not fit in the comment box.
In this part of the code
func main() {
var (
ordersFile *os.File
files Files
orders Orders
err error
)
mu := new(sync.Mutex)
wg := &sync.WaitGroup{}
wg.Add(1)
The last statement is a call to wg.Add that appears to be dangling. By that I mean it is hard to tell what will trigger the matching wg.Done call. Calling wg.Add without a corresponding wg.Done is a mistake, and writing the two so far apart that the pair cannot be found at a glance is error-prone.
In that part of the code, it is clearly wrong
go func(currentFilename string, ch chan<- map[string][]string, i int, orders Orders, wg *sync.WaitGroup, filenamesSize *int, mu *sync.Mutex, done chan<- bool) {
wg.Add(1)
defer wg.Done()
Consider that by the time the routine is executed, and that you added 1 to the waitgroup, the parent routine continues to execute. See this example: https://play.golang.org/p/N9Chaqkv4bd
The main routine does not wait for the waitgroup because it does not have time to increment.
There is more to say but i find it hard to understand the purpose of your code so i am not sure how to help you further without basically rewrite it.

How to always get the latest value from a Go channel?

I'm starting out with Go and I'm now writing a simple program which reads out data from a sensor and puts that into a channel to do some calculations with it. I now have it working as follows:
package main
import (
"fmt"
"time"
"strconv"
)
// get_sensor_data simulates a sensor: after an initial one-second delay it
// sends five readings on c, one after the other.
func get_sensor_data(c chan float64) {
time.Sleep(1 * time.Second) // wait a second before sensor data starts pouring in
c <- 2.1 // Sensor data starts being generated
c <- 2.2
c <- 2.3
c <- 2.4
c <- 2.5
}
// main polls the sensor channel once per iteration without blocking, prints
// the current value of s and then sleeps for 500ms. At most one value is
// taken from the channel per iteration. The loop never exits.
func main() {
s := 1.1
c := make(chan float64)
go get_sensor_data(c)
for {
select {
case s = <-c:
fmt.Println("the next value of s from the channel: " + strconv.FormatFloat(s, 'f', 1, 64))
default:
// no new values in the channel
}
fmt.Println(s)
time.Sleep(500 * time.Millisecond) // Do heavy "work"
}
}
This works fine, but the sensor generates a lot of data, and I'm always only interested in the latest data. With this setup however, it only reads out the next item with every loop, which means that if the channel at some point contains 20 values, the newest value only is read out after 10 seconds.
Is there a way for a channel to always only contain one value at a time, so that I always only get the data I'm interested in, and no unnecessary memory is used by the channel (although the memory is the least of my worries)?
Channels are best thought of as queues (FIFO). Therefore you can't really skip around. However there are libraries out there that do stuff like this: https://github.com/cloudfoundry/go-diodes is an atomic ring buffer that will overwrite old data. You can set a smaller size if you like.
All that being said, it doesn't sound like you need a queue (or ring buffer). You just need a mutex:
// SensorData stores the most recent sensor reading and protects it with a
// read/write mutex. The zero value is ready to use and reports 0 until the
// first Store. A SensorData must not be copied after first use.
type SensorData struct {
	mu   sync.RWMutex
	last float64
}

// Store replaces the stored reading with data.
func (d *SensorData) Store(data float64) {
	// The mutex is a field of the receiver, so it must be reached through d.
	d.mu.Lock()
	defer d.mu.Unlock()
	d.last = data
}

// Get returns the most recently stored reading. It takes only the read lock,
// so several readers can call it at the same time.
func (d *SensorData) Get() float64 {
	d.mu.RLock()
	defer d.mu.RUnlock()
	return d.last
}
This uses a RWMutex which means many things can read from it at the same time while only a single thing can write. It will store a single entry much like you said.
No. Channels are FIFO buffers, full stop. That is how channels work and their only purpose. If you only want the latest value, consider just using a single variable protected by a mutex; write to it whenever new data comes in, and whenever you read it, you will always be reading the latest value.
Channels serve a specific purpose. You might instead want to update a variable inside a lock whenever a new value is to be set.
This way the receiver will always get the latest value.
You cannot get that from one channel directly, but you can use one channel per value and get notified when there are new values:
package main
import (
"fmt"
"strconv"
"sync"
"time"
)
// LatestChannel holds only the most recent value. Each value comes with a
// notification channel that is closed as soon as a newer value is pushed.
type LatestChannel struct {
	n    float64
	next chan struct{}
	mu   sync.Mutex
}

// New returns a LatestChannel holding the zero value and an open
// notification channel.
func New() *LatestChannel {
	c := new(LatestChannel)
	c.next = make(chan struct{})
	return c
}

// Push stores n as the current value, installs a fresh notification channel
// and then closes the previous one to wake everybody waiting for an update.
func (c *LatestChannel) Push(n float64) {
	fresh := make(chan struct{})
	c.mu.Lock()
	stale := c.next
	c.n, c.next = n, fresh
	c.mu.Unlock()
	// Closed outside the critical section, after the new state is visible.
	close(stale)
}

// Get returns the current value together with the channel that will be
// closed when a newer value has been pushed.
func (c *LatestChannel) Get() (float64, <-chan struct{}) {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.n, c.next
}
// getSensorData simulates a sensor: it waits one second, pushes the first
// reading and then pushes four more readings 100ms apart.
func getSensorData(c *LatestChannel) {
	pause := 1 * time.Second
	for _, reading := range []float64{2.1, 2.2, 2.3, 2.4, 2.5} {
		time.Sleep(pause)
		c.Push(reading)
		// Only the very first reading is preceded by the long delay.
		pause = 100 * time.Millisecond
	}
}
// main polls the LatestChannel once per iteration without blocking: when the
// notification channel has been closed it fetches the newest value and a
// fresh notification channel. It then prints the current value and sleeps
// for 250ms. The loop never exits.
func main() {
	current := 1.1
	c := New()
	_, updated := c.Get()
	go getSensorData(c)
	for {
		select {
		case <-updated:
			current, updated = c.Get()
			fmt.Println("the next value of s from the channel: " + strconv.FormatFloat(current, 'f', 1, 64))
		default:
			// no new values in the channel
		}
		fmt.Println(current)
		time.Sleep(250 * time.Millisecond) // Do heavy "work"
	}
}
If you do not need the notify about new value, you can try to read Channels inside channels pattern in Golang.
Try this package https://github.com/subbuv26/chanup
It allows the producer to update the channel with the latest value, which replaces the previous value, and the producer does not get blocked (this way, stale values get overwritten).
So, on the consumer side, always only the latest item gets read.
import "github.com/subbuv26/chanup"
ch := chanup.GetChan()
_ := ch.Put(testType{
a: 10,
s: "Sample",
})
_ := ch.Update(testType{
a: 20,
s: "Sample2",
})
// Continue updating with latest values
...
...
// On consumer end
val := ch.Get()
// val contains latest value
There is another way to solve this problem (trick)
sender work faster: sender remove channel if channel_length > 1
go func() {
for {
msg:=strconv.Itoa(int(time.Now().Unix()))
fmt.Println("make: ",msg," at:",time.Now())
messages <- msg
if len(messages)>1{
//remove old message
<-messages
}
time.Sleep(2*time.Second)
}
}()
receiver work slower:
go func() {
for {
channLen :=len(messages)
fmt.Println("len is ",channLen)
fmt.Println("received",<-messages)
time.Sleep(10*time.Second)
}
}()
OR, we can delete old message from receiver side
(read message like delete it)
There is an elegant channel-only solution. If you're OK with adding one more channel and goroutine, you can introduce an unbuffered channel and a goroutine that tries to send the latest value from your channel to it:
package main
import (
"fmt"
"time"
)
// wrapLatest returns a channel that delivers the most recent value received
// from ch at the moment the consumer is ready to receive. The returned
// channel is closed when ch is closed.
func wrapLatest(ch <-chan int) <-chan int {
result := make(chan int) // important that this one is unbuffered
go func() {
defer close(result)
// Block until the first value is available.
value, ok := <-ch
if !ok {
return
}
// Without blocking, skip ahead to the newest value already queued in ch.
LOOP:
for {
select {
case value, ok = <-ch:
if !ok {
return
}
default:
break LOOP
}
}
// Either replace the held value with a newer one from ch, or hand it to the
// consumer. After a hand-over, block until the next value arrives.
// NOTE(review): when ch is closed the goroutine returns at once, so a held
// value that has not been handed over yet is not delivered.
for {
select {
case value, ok = <-ch:
if !ok {
return
}
case result <- value:
if value, ok = <-ch; !ok {
return
}
}
}
}()
return result
}
// main demonstrates wrapLatest: the send channel is pre-filled with 0..9,
// a producer adds 10..19 at one value per second and then closes the channel,
// while the consumer reads from the wrapped channel every two seconds.
func main() {
sendChan := make(chan int, 10) // may be buffered or not
for i := 0; i < 10; i++ {
sendChan <- i
}
go func() {
for i := 10; i < 20; i++ {
sendChan <- i
time.Sleep(time.Second)
}
close(sendChan)
}()
recvChan := wrapLatest(sendChan)
for i := range recvChan {
fmt.Println(i)
time.Sleep(time.Second * 2)
}
}

Synchronize workers for recursive crawl

I would like to implement a "crawler" with n workers where each worker is able to add additional jobs. The program should stop when there are no jobs left and all workers have finished their work.
I have the following code (you can play with it at https://play.golang.org/p/_j22p_OfYv):
package main
import (
"fmt"
"sync"
)
// main starts a fixed number of workers that read paths from pathChan, queue
// sub-directories back onto pathChan and send file names on fileChan; main
// itself prints everything received on fileChan.
func main() {
pathChan := make(chan string)
fileChan := make(chan string)
workers := 3
var wg sync.WaitGroup
paths := map[string][]string{
"/": {"/test", "/foo", "a", "b"},
"/test": {"aa", "bb", "cc"},
"/foo": {"/bar", "bbb", "ccc"},
"/bar": {"aaaa", "bbbb", "cccc"},
}
for i := 0; i < workers; i++ {
wg.Add(1)
go func() {
for {
path, ok := <-pathChan
if !ok {
break
}
for _, f := range paths[path] {
if f[0] == '/' {
// The workers are the only receivers on this unbuffered channel, so this
// send blocks until another worker is free to receive.
pathChan <- f
} else {
fileChan <- f
}
}
}
wg.Done()
}()
}
pathChan <- "/"
// NOTE(review): fileChan is never closed anywhere in this function, so this
// loop cannot end and the wg.Wait / close(pathChan) below are never reached.
for {
filePath, ok := <-fileChan
if !ok {
break
}
fmt.Println(filePath)
}
wg.Wait()
close(pathChan)
}
Unfortunately, this ends in a dead-lock. Where exactly is the problem? Also, what is the best practice to write such functionality? Are channels the correct feature to use?
EDIT:
I have updated my code to use two wait groups, one for the jobs and one for the workers (see https://play.golang.org/p/bueUJzMhqj):
package main
import (
"fmt"
"sync"
)
// main is the crawler with two WaitGroups: jobs counts the paths that are
// queued or being processed, workers counts the running worker goroutines.
// When jobs reaches zero pathChan is closed, and once all workers have
// returned fileChan is closed, which ends the printing loop.
func main() {
pathChan := make(chan string)
fileChan := make(chan string)
jobs := new(sync.WaitGroup)
workers := new(sync.WaitGroup)
nworkers := 2
paths := map[string][]string{
"/": {"/test", "/foo", "a", "b"},
"/test": {"aa", "bb", "cc"},
"/foo": {"/bar", "bbb", "ccc"},
"/bar": {"aaaa", "bbbb", "cccc"},
}
for i := 0; i < nworkers; i++ {
workers.Add(1)
go func() {
defer workers.Done()
for {
path, ok := <-pathChan
if !ok {
break
}
for _, f := range paths[path] {
if f[0] == '/' {
// Count the new job before queuing it; the send blocks until another
// worker receives it because pathChan is unbuffered.
jobs.Add(1)
pathChan <- f
} else {
fileChan <- f
}
}
// The path taken from pathChan has been fully expanded.
jobs.Done()
}
}()
}
jobs.Add(1)
pathChan <- "/"
// Shutdown sequence: no jobs left -> stop the workers -> stop the printer.
go func() {
jobs.Wait()
close(pathChan)
workers.Wait()
close(fileChan)
}()
for {
filePath, ok := <-fileChan
if !ok {
break
}
fmt.Println(filePath)
}
}
This indeed seems to work, but obviously a deadlock will still happen if nworkers is set to 1, because the single worker will wait forever when adding something to the channel pathChan. To solve this issue, the channel buffer can be increased (e.g. pathChan := make(chan string, 2)), but this will only work as long as the buffer isn't completely full. Of course, the buffer size could be set to a large number, say 10000, but the code could still hit a deadlock. Additionally, this doesn't seem to be a clean solution to me.
This is where I realized that it would be easier to use some sort of queue instead of a channel, where elements can be added and removed without blocking and where the size of the queue isn't fixed. Do such queues exist in the Go standard library?
If you want to wait for an arbitrary number of workers to finish, the standard library includes sync.WaitGroup for exactly this purpose.
There are other concurrency issues as well:
You're using channel closure signalling, but you have multiple goroutines sending on the same channel. This is generally bad practice: since each routine can never know when the other routines are done with the channel, you can never correctly close the channel.
Closing one channel waits on the other to be closed first, but it will never be closed, so it deadlocks.
The only reason it doesn't deadlock immediately is your example happens to have more workers than directories under "/". Add two more directories under "/" and it deadlocks immediately.
There are some solutions:
Dump the worker pool and just spin a goroutine for every subdirectory, and let the scheduler worry about the rest: https://play.golang.org/p/ck2DkNFnyF
Use one worker per root-level directory, and have each worker process its directory recursively rather than queuing subdirectories it finds to a channel.

Selecting between time interval and length of channel

I'm here to find out the most idiomatic way to do the follow task.
Task:
Write data from a channel to a file.
Problem:
I have a channel ch := make(chan int, 100)
I need to read from the channel and write the values I read from the channel to a file. My question is basically how do I do so given that
If channel ch is full, write the values immediately
If channel ch is not full, write every 5s.
So essentially, data needs to be written to the file at least every 5s (assuming that data will be filled into the channel at least every 5s)
Whats the best way to use select, for and range to do my above task?
Thanks!
There is no such "event" as "buffer of channel is full", so you can't detect that [*]. This means you can't idiomatically solve your problem with language primitives using only 1 channel.
[*] Not entirely true: you could detect if the buffer of a channel is full by using select with default case when sending on the channel, but that requires logic from the senders, and repetitive attempts to send.
I would use another channel from which I would receive as values are sent on it, and "redirect", store the values in another channel which has a buffer of 100 as you mentioned. At each redirection you may check if the internal channel's buffer is full, and if so, do an immediate write. If not, continue to monitor the "incoming" channel and a timer channel with a select statement, and if the timer fires, do a "regular" write.
You may use len(chInternal) to check how many elements are in the chInternal channel, and cap(chInternal) to check its capacity. Note that this is "safe" as we are the only goroutine handling the chInternal channel. If there would be multiple goroutines, value returned by len(chInternal) could be outdated by the time we use it to something (e.g. comparing it).
In this solution chInternal (as its name says) is for internal use only. Others should only send values on ch. Note that ch may or may not be a buffered channel, solution works in both cases. However, you may improve efficiency if you also give some buffer to ch (so chances that senders get blocked will be lower).
// chInternal is the staging buffer: values received from ch are parked here
// until the next write. It is meant for internal use by the main loop only.
var chInternal = make(chan int, 100)

// ch is the channel other goroutines send values on.
// You may (should) make this a buffered channel too.
var ch = make(chan int)
// main moves every value received on ch into chInternal and triggers a write
// in two situations: immediately when chInternal's buffer becomes full, and
// otherwise whenever 5 seconds have elapsed since the last write.
// The loop never terminates.
func main() {
	delay := time.Second * 5
	timer := time.NewTimer(delay)
	for {
		select {
		case v := <-ch:
			chInternal <- v
			if len(chInternal) == cap(chInternal) {
				doWrite() // Buffer is full, we need to write immediately

				// Restart the countdown. The timer may have fired while we
				// were staging or writing; Stop reports that by returning
				// false, and the stale tick must then be drained so it does
				// not cause an early "regular" write right after Reset.
				if !timer.Stop() {
					select {
					case <-timer.C:
					default:
					}
				}
				timer.Reset(delay)
			}
		case <-timer.C:
			doWrite() // "Regular" write: 5 seconds have passed since last write
			// The tick was just received, so the channel is empty and a
			// plain Reset is sufficient here.
			timer.Reset(delay)
		}
	}
}
If an immediate write happens (due to a "buffer full" situation), this solution will schedule the next "regular" write 5 seconds after it. If you don't want this and you want the 5-second regular writes to be independent of the immediate writes, simply do not reset the timer following the immediate write.
An implementation of doWrite() may be as follows:
// f is the output file. Make sure to open it for writing before doWrite runs.
var f *os.File

// doWrite empties chInternal into f, writing each value as a decimal number
// followed by a space. It never blocks: as soon as a receive would have to
// wait (the buffer is empty), it returns.
func doWrite() {
	for drained := false; !drained; {
		select {
		case n := <-chInternal:
			fmt.Fprintf(f, "%d ", n) // Write n to the file
		default:
			// Nothing left in chInternal's buffer.
			drained = true
		}
	}
}
We can't use for ... range as that only returns when the channel is closed, but our chInternal channel is not closed. So we use a select with a default case so when no more values are in the buffer of chInternal, we return.
Improvements
Using a slice instead of 2nd channel
Since the chInternal channel is only used by us, and only on a single goroutine, we may also choose to use a single []int slice instead of a channel (reading/writing a slice is much faster than a channel).
Showing only the different / changed parts, it could look something like this:
// buf stages incoming values between writes. It starts empty with room for
// 100 values, mirroring the buffer size of the channel it replaces.
var buf = make([]int, 0, 100)
// main (excerpt): only the lines that differ from the channel-based version
// are shown. The elided parts, including the remaining select case and the
// closing braces, are intentionally left out of this fragment.
func main() {
// ... (elided: presumably the same timer setup as the channel-based version)
for {
select {
case v := <-ch:
// Stage the value in the slice instead of sending it on chInternal.
buf = append(buf, v)
// The slice has reached its capacity: this is the "buffer full" condition.
if len(buf) == cap(buf) {
// ... (elided: presumably the immediate write, as in the channel-based version)
}
}
// doWrite writes every staged value in buf to f as a decimal number followed
// by a space, then empties buf while keeping its backing array for reuse.
func doWrite() {
	for i := 0; i < len(buf); i++ {
		fmt.Fprintf(f, "%d ", buf[i]) // Write the value to the file
	}
	// Truncate to length zero; the capacity is unchanged.
	buf = buf[:0]
}
With multiple goroutines
If we stick with keeping chInternal as a channel, the doWrite() function may be called on another goroutine so that it does not block the main one, e.g. go doWrite(). Since the data to write is read from a channel (chInternal), this requires no further synchronization.
If the 5-second interval is only there to improve file-write performance,
you can instead fill the channel whenever you need to
and let a writer goroutine write that data to a buffered file.
See this very simple and idiomatic sample, which uses no timer,
just for ... range:
package main
import (
"bufio"
"fmt"
"os"
"sync"
)
// wg lets the caller wait until WriteToFile has flushed and closed its file.
var wg sync.WaitGroup

// WriteToFile creates (or truncates) filename and writes every value received
// from ch to it, each formatted as a decimal number followed by a space. It
// returns once ch has been closed and drained.
//
// Output goes through a 4 MiB buffered writer. The caller must call wg.Add(1)
// before starting WriteToFile; wg.Done is called on return. WriteToFile panics
// if the file cannot be created. Errors from the final flush or from closing
// the file are reported on standard error instead of being dropped.
func WriteToFile(filename string, ch chan int) {
	f, e := os.Create(filename)
	if e != nil {
		panic(e)
	}
	w := bufio.NewWriterSize(f, 4*1024*1024)

	// Deferred calls run last-in-first-out, so on return the buffer is
	// flushed first, then the file is closed, and only then is wg released.
	defer wg.Done()
	defer func() {
		if err := f.Close(); err != nil {
			fmt.Fprintf(os.Stderr, "closing %q: %v\n", filename, err)
		}
	}()
	defer func() {
		if err := w.Flush(); err != nil {
			fmt.Fprintf(os.Stderr, "flushing %q: %v\n", filename, err)
		}
	}()

	for v := range ch {
		// A failed write is remembered by the bufio.Writer and returned by
		// the final Flush, so it is not checked per value here.
		fmt.Fprintf(w, "%d ", v)
	}
}
// main starts WriteToFile on its own goroutine, sends the numbers
// 0..499999 to it through a channel with a buffer of 100, then closes the
// channel and waits for the writer to finish.
func main() {
	const total = 500000

	values := make(chan int, 100)

	wg.Add(1)
	go WriteToFile("file.txt", values)

	for n := 0; n < total; n++ {
		values <- n // do the job
	}

	// Closing the channel ends the writer's loop; Wait returns once the
	// writer has called wg.Done.
	close(values)
	wg.Wait()
}
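Note the order of the defers: deferred calls run in reverse order, so the buffer is flushed first, then the file is closed, and only then is wg.Done() called.
If you do need a write every 5 seconds, you can add an interval timer whose only job is to flush this file's buffer to disk, like this: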
package main
import (
"bufio"
"fmt"
"os"
"sync"
"time"
)
// wg lets the caller wait until WriteToFile has flushed and closed its file.
var wg sync.WaitGroup

// WriteToFile creates (or truncates) filename and writes every value received
// from ch to it, each formatted as a decimal number followed by a space. It
// returns once ch has been closed and drained.
//
// Output goes through a 4 MiB buffered writer. A helper goroutine flushes
// that buffer every 5 seconds when it holds data, printing the number of
// buffered bytes first. The caller must call wg.Add(1) before starting
// WriteToFile; wg.Done is called on return. WriteToFile panics if the file
// cannot be created. Errors from the final flush or from closing the file are
// reported on standard error.
func WriteToFile(filename string, ch chan int) {
	f, e := os.Create(filename)
	if e != nil {
		panic(e)
	}
	w := bufio.NewWriterSize(f, 4*1024*1024)

	// bufio.Writer is not safe for concurrent use, and it is shared between
	// the write loop below and the periodic flusher, so every access made
	// while the flusher is running goes through mu.
	var mu sync.Mutex

	ticker := time.NewTicker(5 * time.Second)
	quit := make(chan struct{})
	flusherDone := make(chan struct{})
	go func() {
		defer close(flusherDone)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				mu.Lock()
				if w.Buffered() > 0 {
					fmt.Println(w.Buffered())
					// A flush error is remembered by the writer and is
					// reported by the final Flush.
					w.Flush()
				}
				mu.Unlock()
			case <-quit:
				return
			}
		}
	}()

	// Deferred calls run last-in-first-out: stop the flusher and wait for it
	// to exit, flush what is left, close the file, then release wg.
	defer wg.Done()
	defer func() {
		if err := f.Close(); err != nil {
			fmt.Fprintf(os.Stderr, "closing %q: %v\n", filename, err)
		}
	}()
	defer func() {
		// The flusher has exited by now, so no locking is needed.
		if err := w.Flush(); err != nil {
			fmt.Fprintf(os.Stderr, "flushing %q: %v\n", filename, err)
		}
	}()
	defer func() {
		close(quit)
		<-flusherDone
	}()

	for v := range ch {
		mu.Lock()
		fmt.Fprintf(w, "%d ", v)
		mu.Unlock()
	}
}
// main starts WriteToFile on its own goroutine and sends it the numbers
// 0..24, pausing half a second after each one, then closes the channel and
// waits for the writer to finish.
func main() {
	const (
		count = 25
		pause = 500 * time.Millisecond
	)

	values := make(chan int, 100)

	wg.Add(1)
	go WriteToFile("file.txt", values)

	for n := 0; n < count; n++ {
		values <- n // do the job
		time.Sleep(pause)
	}

	// Closing the channel tells the writer to finish; Wait returns once the
	// writer has called wg.Done.
	close(values)
	wg.Wait()
}
Here I used time.NewTicker(5 * time.Second) as the interval timer, together with a quit channel; you could also use time.AfterFunc(), time.Tick(), or time.Sleep().
With some optimization (removing the quit channel):
package main
import (
"bufio"
"fmt"
"os"
"sync"
"time"
)
// wg lets the caller wait until WriteToFile has flushed and closed its file.
var wg sync.WaitGroup

// WriteToFile creates (or truncates) filename and writes every value received
// from ch to it, each formatted as a decimal number followed by a space.
// When ch is closed and drained it prints "done." and returns.
//
// Output goes through a 4 MiB buffered writer. Every 5 seconds the buffer is
// flushed if it holds data, after printing the number of buffered bytes.
// Writing and flushing happen on the same goroutine, so the writer needs no
// locking. The caller must call wg.Add(1) before starting WriteToFile;
// wg.Done is called on return. WriteToFile panics if the file cannot be
// created. Errors from the final flush or from closing the file are reported
// on standard error.
func WriteToFile(filename string, ch chan int) {
	f, e := os.Create(filename)
	if e != nil {
		panic(e)
	}
	w := bufio.NewWriterSize(f, 4*1024*1024)
	ticker := time.NewTicker(5 * time.Second)

	// Deferred calls run last-in-first-out: stop the ticker, flush what is
	// left, close the file, then release wg.
	defer wg.Done()
	defer func() {
		if err := f.Close(); err != nil {
			fmt.Fprintf(os.Stderr, "closing %q: %v\n", filename, err)
		}
	}()
	defer func() {
		if err := w.Flush(); err != nil {
			fmt.Fprintf(os.Stderr, "flushing %q: %v\n", filename, err)
		}
	}()
	defer ticker.Stop()

	for {
		select {
		case v, ok := <-ch:
			if !ok {
				// ch was closed and holds no more values.
				fmt.Println("done.")
				return
			}
			fmt.Fprintf(w, "%d ", v)
		case <-ticker.C:
			if w.Buffered() > 0 {
				fmt.Println(w.Buffered())
				// A flush error is remembered by the writer and is
				// reported by the final Flush.
				w.Flush()
			}
		}
	}
}
// main feeds the numbers 0..24 to a WriteToFile goroutine at a rate of one
// value every 500 milliseconds, then closes the channel and blocks until the
// writer has finished.
func main() {
	out := make(chan int, 100)

	wg.Add(1)
	go WriteToFile("file.txt", out)

	n := 0
	for n < 25 {
		out <- n // do the job
		time.Sleep(500 * time.Millisecond)
		n++
	}

	close(out) // Finish the job and close output file
	wg.Wait()
}
I hope this helps.

Resources