Understanding correct use of channels in golang concurrent context - go

I am writing a Go project, a simple web crawler that follows the links on a website, to experiment with concurrency features such as goroutines and channels. But when I run it, it never finishes: nothing is printed, as if nothing were happening. I have no idea what went wrong. Can somebody point it out for me?
It works and shows all the crawled links if I remove the channel logic, but I want it to send the links into a buffered channel and then display them before the program ends. The program is supposed to be able to crawl to any depth specified in the program; currently the depth is 1.
package main
import (
"fmt"
"log"
"net/http"
"os"
"strings"
"time"
"golang.org/x/net/html"
)
// Link is the value crawl sends over the channel: one hyperlink target found
// on a fetched page.
type Link struct {
// URL is the href value exactly as it appeared in the page.
URL string
// ok is set to true by crawl when it builds a Link; no code shown here reads it.
ok bool
}
// main crawls the URL given as the only command-line argument and prints every
// distinct link URL received on ch.
//
// NOTE(review): three things in this function keep it from finishing:
//   - crawl is called synchronously before anything receives from ch, and ch
//     buffers only 5 links, so crawl blocks on its sixth send;
//   - the range loop over ch ends only once ch is closed, but close(ch) is
//     placed after that loop, so the loop can never terminate;
//   - the usage branch prints a message but does not return, so os.Args[1]
//     is still indexed when the argument is missing.
func main() {
if len(os.Args) != 2 {
fmt.Println("Usage: crawl [URL].")
}
url := os.Args[1]
// Prepend a scheme when the argument does not start with "http://".
if !strings.HasPrefix(url, "http://") {
url = "http://" + url
}
ch := make(chan *Link, 5)
// Runs in the calling goroutine; nothing is receiving from ch yet.
crawl(url, 1, ch)
// visited is used as a set to de-duplicate the received URLs.
visited := make(map[string]bool)
time.Sleep(2 * time.Second)
for link := range ch {
if _, ok := visited[link.URL]; !ok {
visited[link.URL] = true
}
}
close(ch)
for l := range visited {
fmt.Println(l)
}
}
// crawl fetches url, tokenizes the response body as HTML and, for every <a>
// start tag whose href begins with "http", starts a goroutine that crawls that
// href with depth n-1 and then sends a Link for it on ch. It returns
// immediately when n < 1.
//
// NOTE(review): the goroutines started here are not tracked by anything, so
// the caller has no way to know when all sends on ch are finished.
func crawl(url string, n int, ch chan *Link) {
if n < 1 {
return
}
resp, err := http.Get(url)
if err != nil {
// log.Fatalf exits the process itself, so the os.Exit below is never reached.
log.Fatalf("Can not reach the site. Error = %v\n", err)
os.Exit(1)
}
b := resp.Body
defer b.Close()
z := html.NewTokenizer(b)
nextN := n - 1
for {
token := z.Next()
switch token {
case html.ErrorToken:
// The tokenizer reported an error token: stop processing this page.
return
case html.StartTagToken:
current := z.Token()
// Only anchor tags are of interest.
if current.Data != "a" {
continue
}
result, ok := getHrefTag(current)
if !ok {
continue
}
// Links that do not start with "http" (e.g. relative ones) are skipped.
hasProto := strings.HasPrefix(result, "http")
if hasProto {
go crawl(result, nextN, ch)
// Blocks once ch's buffer is full and nobody is receiving.
ch <- &Link{result, true}
}
}
}
}
// getHrefTag scans the attributes of token and returns the value of the first
// one whose key is "href". ok is false, and result is empty, when the token
// carries no such attribute.
func getHrefTag(token html.Token) (result string, ok bool) {
	for i := range token.Attr {
		if attr := token.Attr[i]; attr.Key == "href" {
			return attr.Val, true
		}
	}
	return "", false
}
UPDATED:
After some fiddling I figured out how to change the code to remove the data races. However, I still don't know how to avoid crawling URLs that were already visited (maybe I should start another question?):
package main
import (
"fmt"
"log"
"net/http"
"os"
"strings"
"golang.org/x/net/html"
)
// main crawls the URL given as the only command-line argument to depth 1 and
// prints every link as it arrives on the channel returned by newCrawl.
func main() {
	if len(os.Args) != 2 {
		fmt.Println("Usage: crawl [URL].")
		// Stop here: without the argument, os.Args[1] below would panic.
		os.Exit(1)
	}
	url := os.Args[1]
	// Only add a scheme when none is present, so https:// URLs stay intact.
	if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
		url = "http://" + url
	}
	for link := range newCrawl(url, 1) {
		fmt.Println(link)
	}
}
// newCrawl starts crawling url to depth num in a background goroutine and
// returns the channel on which the discovered links are delivered. The channel
// is closed once crawl has returned, so callers can simply range over it.
func newCrawl(url string, num int) chan string {
	ch := make(chan string, 20)
	go func() {
		// Close on the sending side so the receiver's range loop terminates.
		defer close(ch)
		// Forward the requested depth instead of a hard-coded 1.
		crawl(url, num, ch)
	}()
	return ch
}
func crawl(url string, n int, ch chan string) {
if n < 1 {
return
}
resp, err := http.Get(url)
if err != nil {
log.Fatalf("Can not reach the site. Error = %v\n", err)
os.Exit(1)
}
b := resp.Body
defer b.Close()
z := html.NewTokenizer(b)
nextN := n - 1
for {
token := z.Next()
switch token {
case html.ErrorToken:
return
case html.StartTagToken:
current := z.Token()
if current.Data != "a" {
continue
}
result, ok := getHrefTag(current)
if !ok {
continue
}
hasProto := strings.HasPrefix(result, "http")
if hasProto {
done := make(chan struct{})
go func() {
crawl(result, nextN, ch)
close(done)
}()
<-done
ch <- result
}
}
}
}
// getHrefTag looks for an "href" attribute on token. It returns the value of
// the first match with ok set to true, or "" and false if there is none.
func getHrefTag(token html.Token) (result string, ok bool) {
	for idx := 0; idx < len(token.Attr); idx++ {
		if token.Attr[idx].Key != "href" {
			continue
		}
		return token.Attr[idx].Val, true
	}
	return "", false
}

I think that calling goroutines recursively is not a good idea; it can easily get out of control. I would prefer a flatter model like this:
package main
import (
"fmt"
"log"
"net/http"
"os"
"strings"
"sync"
"golang.org/x/net/html"
)
// main crawls the URL given as the only command-line argument with a maximum
// depth of 1 and prints every URL recorded in the work group's urlMap together
// with the depth stored for it.
func main() {
	if len(os.Args) != 2 {
		fmt.Println("Usage: crawl [URL].")
		// Stop here: without the argument, os.Args[1] below would panic.
		os.Exit(1)
	}
	url := os.Args[1]
	// Only add a scheme when none is present, so https:// URLs stay intact.
	if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
		url = "http://" + url
	}
	wg := NewWorkGroup(1)
	wg.Crawl(url)
	for k, v := range wg.urlMap {
		fmt.Printf("%s: %d\n", k, v)
	}
}
// Link represents a single link and the depth at which it was found.
type Link struct {
// url is the link target.
url string
// deph is the depth: 0 for the start URL, source depth + 1 for links
// extracted from a page by getExternalUrls.
deph uint32
}
// WorkGroup bundles everything one crawl needs: the wait group for its
// goroutines, the queues connecting crawlers and dispatcher, and the result map.
type WorkGroup struct {
// Embedded so Add/Done/Wait can be called on the WorkGroup directly.
*sync.WaitGroup
// maxDeph: only links with deph below this value are queued for crawling.
maxDeph uint32
// numW is the number of crawler goroutines started by spawnCrawlers.
numW int
// pool receives a crawler each time it has picked up a link; the dispatcher
// reads from it to collect that crawler's result.
pool chan *Worker
// linkQ holds the links waiting to be crawled.
linkQ chan Link
// urlMap maps each URL seen to the depth recorded when it was first seen.
urlMap map[string]uint32
}
// Worker is the handle of one crawler goroutine.
type Worker struct {
// result carries the links extracted from one page from the crawler to the
// dispatcher.
result chan []Link
}
// newWorker allocates a Worker whose result channel is unbuffered.
func newWorker() *Worker {
	w := new(Worker)
	w.result = make(chan []Link)
	return w
}
// NewWorkGroup builds a WorkGroup that follows links up to maxDeph levels.
// The number of crawler goroutines equals maxDeph, clamped to the range
// [1, 10]; the pool channel is sized to hold every crawler at once.
func NewWorkGroup(maxDeph uint32) *WorkGroup {
	const maxWorkers = 10
	numW := maxWorkers
	if maxDeph < maxWorkers {
		numW = int(maxDeph)
	}
	// With zero crawlers nothing would ever consume linkQ and Crawl would
	// wait forever, so always keep at least one.
	if numW < 1 {
		numW = 1
	}
	return &WorkGroup{
		WaitGroup: new(sync.WaitGroup),
		maxDeph:   maxDeph,
		numW:      numW,
		pool:      make(chan *Worker, numW),
		linkQ:     make(chan Link, 100),
		urlMap:    make(map[string]uint32),
	}
}
// spawnDispatcher starts the single dispatcher goroutine. It takes crawlers
// from pool in the order they registered, waits for each one's result, records
// every URL not seen before in urlMap, and queues those still below maxDeph on
// linkQ. When it returns it closes linkQ, which ends the crawlers' range loops.
//
// NOTE(review): the termination test below looks at len(linkQ) and len(pool)
// only. A crawler that has already taken a link from linkQ but has not yet
// pushed itself onto pool is invisible to that test — confirm this cannot make
// the dispatcher exit while a crawler is still about to send its result.
func (wg *WorkGroup) spawnDispatcher() {
wg.Add(1)
go func() {
defer wg.Done()
defer close(wg.linkQ)
for w := range wg.pool {
links := <-w.result
for i := 0; i < len(links); i++ {
// Only URLs that are not in urlMap yet are recorded and considered.
if _, ok := wg.urlMap[links[i].url]; !ok {
wg.urlMap[links[i].url] = links[i].deph
// Don't process links that have reached the maximum depth.
if links[i].deph < wg.maxDeph {
select {
case wg.linkQ <- links[i]:
// Queued successfully; go on with the next link.
continue
default:
// linkQ is full: do not block here, to avoid a possible deadlock.
}
// Drop the rest of this page's links (this break leaves the for loop).
break
}
}
}
// Empty link queue + no crawler registered in pool = end.
if len(wg.linkQ) == 0 && len(wg.pool) == 0 {
return
}
}
}()
}
// Crawl starts the crawler and dispatcher goroutines, seeds the link queue
// with url at depth 0 and blocks until every goroutine registered on the wait
// group has finished. pool is closed on return.
func (wg *WorkGroup) Crawl(url string) {
defer close(wg.pool)
wg.spawnCrawlers()
wg.spawnDispatcher()
// The seed link; linkQ is buffered, so this send does not wait for a crawler.
wg.linkQ <- Link{url: url, deph: 0}
wg.Wait()
}
// spawnCrawlers starts numW crawler goroutines, one newCrawler call each.
func (wg *WorkGroup) spawnCrawlers() {
	for remaining := wg.numW; remaining > 0; remaining-- {
		wg.newCrawler()
	}
}
// newCrawler starts one crawler goroutine with its own Worker. For every link
// taken from linkQ the crawler first registers itself on pool and then sends
// the links extracted from that page on its result channel. The goroutine ends
// when linkQ is closed; it then closes its result channel.
func (wg *WorkGroup) newCrawler() {
wg.Add(1)
go func(w *Worker) {
defer wg.Done()
defer close(w.result)
for link := range wg.linkQ {
// Register before fetching so the dispatcher reads results in pick-up order.
wg.pool <- w
// result is unbuffered: this blocks until the dispatcher receives.
w.result <- getExternalUrls(link)
}
}(newWorker())
}
// getExternalUrls fetches source.url and returns one Link, at depth
// source.deph+1, for every <a> start tag whose href begins with "http".
// A failed request is logged and yields nil.
func getExternalUrls(source Link) []Link {
	resp, err := http.Get(source.url)
	if err != nil {
		log.Printf("Can not reach the site. Error = %v\n", err)
		return nil
	}
	defer resp.Body.Close()

	found := []Link{}
	childDepth := source.deph + 1
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			// The tokenizer reported an error token: return what was collected.
			return found
		}
		if kind != html.StartTagToken {
			continue
		}
		tag := tokenizer.Token()
		if tag.Data != "a" {
			continue
		}
		if href, ok := getHrefTag(tag); ok && strings.HasPrefix(href, "http") {
			found = append(found, Link{url: href, deph: childDepth})
		}
	}
}
// getHrefTag returns the value of the first "href" attribute of token.
// ok reports whether such an attribute exists; result is "" when it does not.
func getHrefTag(token html.Token) (result string, ok bool) {
	for _, attribute := range token.Attr {
		if attribute.Key != "href" {
			continue
		}
		return attribute.Val, true
	}
	return "", false
}

Related

Why is my Go program so slow when navigating the files

Why is this program so slow? I thought the code was fairly optimized, but it takes significantly longer than the find command when used on my root filesystem.
It takes about 4 minutes, as opposed to the find command, which takes about 40 seconds.
I tried removing the sorting algorithm, but that doesn't speed up the program.
package main
import (
"fmt"
"io"
"io/fs"
"log"
"os"
"sort"
"sync"
"github.com/google/fscrypt/filesystem"
"github.com/sirupsen/logrus"
"gopkg.in/alecthomas/kingpin.v2"
)
var (
mountpoint = kingpin.Flag("mount", "The mount to find the largest file usages. Can be a subath of mount").Required().String()
limit = kingpin.Flag("limit", "The maximum number of files return to the display").Default("10").Short('l').Int()
)
var device string
// fileDisplay is one regular file found during the scan.
type fileDisplay struct {
// Size is the file size as reported by os.Stat, in bytes.
Size int64
// Path is the directory prefix concatenated with the entry name.
Path string
}
// bySize provides the Len/Less/Swap methods that sort.Sort needs to order
// fileDisplay values by ascending Size.
type bySize []fileDisplay
// Len returns the number of elements.
func (a bySize) Len() int { return len(a) }
// Less reports whether element i is smaller than element j.
func (a bySize) Less(i, j int) bool { return a[i].Size < a[j].Size }
// Swap exchanges elements i and j.
func (a bySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
var fileChan = make(chan fileDisplay)
var files []fileDisplay
// main parses the flags, walks the requested mount point concurrently,
// collects every file sent on fileChan and prints the *limit largest ones.
//
// NOTE(review): fileChan is unbuffered, so every handleEntry goroutine that
// found a file stays blocked until this function's receive loop gets to it.
func main() {
log.SetOutput(io.Discard)
kingpin.Version("0.0.1")
kingpin.Parse()
//Define limit after parsing
logrus.SetLevel(logrus.FatalLevel)
// Make sure the mount point ends with "/", because paths are built by
// plain string concatenation.
// NOTE(review): this slice expression panics if *mountpoint is empty.
if (*mountpoint)[len(*mountpoint)-1:] != "/" {
*mountpoint = *mountpoint + "/"
}
fmt.Println("Finding the top", *limit, "largest files on filesystem", *mountpoint, "\n================================================")
mount, err := filesystem.FindMount(*mountpoint)
if err != nil {
logrus.Fatal(err)
}
// Remember the device so entries on other devices can be skipped.
device = mount.Device
entries, err := os.ReadDir(*mountpoint)
if err != nil {
logrus.Fatal(err)
}
var wg sync.WaitGroup
getFiles(*mountpoint, entries, &wg)
// Close fileChan once every handleEntry goroutine is done, which ends the
// range loop below.
go func() {
defer close(fileChan)
wg.Wait()
}()
// NOTE(review): last is never assigned, so the test below is "Size > 0";
// the else branch copies the whole slice to prepend one element, and the
// slice is fully sorted afterwards anyway.
var last int64
for file := range fileChan {
if file.Size > last {
files = append(files, file)
} else {
files = append([]fileDisplay{file}, files...)
}
}
sort.Sort(bySize(files))
// Ascending order: the largest files are the last *limit elements.
var shortFiles []fileDisplay
if len(files) > *limit {
shortFiles = files[len(files)-*limit:]
} else {
shortFiles = files
}
for _, file := range shortFiles {
fmt.Println(file.Path, file.DisplaySizeIEC())
}
}
// getFiles starts one handleEntry goroutine per directory entry, registering
// each on wg before it is launched. start is the directory prefix (ending in
// "/") that entry names are appended to.
func getFiles(start string, entries []fs.DirEntry, wg *sync.WaitGroup) {
for _, entry := range entries {
wg.Add(1)
go handleEntry(start, entry, wg)
}
}
// handleEntry processes one directory entry located at start+entry.Name().
// Entries whose mount device differs from the global device are ignored.
// A regular file is stat'ed and sent on fileChan; a directory is read and its
// entries are handed to getFiles, which spawns further goroutines.
func handleEntry(start string, entry fs.DirEntry, wg *sync.WaitGroup) {
defer wg.Done()
var file fileDisplay
mount, err := filesystem.FindMount(start + entry.Name())
if err != nil {
// NOTE(review): presumably logrus.Fatalln terminates the process, which
// would make the return below unreachable — confirm.
logrus.Fatalln(err, start+entry.Name())
return
}
if mount.Device == device {
if entry.Type().IsRegular() {
fileInfo, err := os.Stat(start + entry.Name())
if err != nil {
logrus.Fatalln(err, start+entry.Name())
return
}
file.Path = start + entry.Name()
file.Size = fileInfo.Size()
// Blocks until main receives: fileChan is unbuffered.
fileChan <- file
} else if entry.IsDir() {
entries, err := os.ReadDir(start + entry.Name())
if err != nil {
logrus.Fatalln(err, start+entry.Name())
return
}
logrus.Info("Searching ", start+entry.Name())
getFiles(start+entry.Name()+"/", entries, wg)
}
}
}
// DisplaySizeIEC formats f.Size with binary (1024-based) units. Sizes below
// 1024 are printed as whole bytes ("512B"); larger ones get two decimals and a
// KiB/MiB/GiB/TiB/PiB/EiB suffix ("1.50KiB").
func (f *fileDisplay) DisplaySizeIEC() string {
	const (
		unit     = 1024
		prefixes = "KMGTPE"
	)
	size := f.Size
	if size < unit {
		return fmt.Sprintf("%dB", size)
	}
	// Find the largest power of 1024 that does not exceed size.
	divisor := int64(unit)
	idx := 0
	for rest := size / unit; rest >= unit; rest /= unit {
		divisor *= unit
		idx++
	}
	scaled := float64(size) / float64(divisor)
	return fmt.Sprintf("%.2f%ciB", scaled, prefixes[idx])
}
Edit: I tried removing the channel and just appending to the slice. This sped it up, but it's not safe, because multiple goroutines could be accessing the slice at the same time.
My final draft involved dropping the channel and using a sync.RWMutex to lock the list, plus a custom append function that appends while holding the lock. This allowed me to drop the channel and use append without risking multiple goroutines editing the same slice.
I dropped the channel because it was causing goroutines to stay open until the for loop over the channel could reach their message. My channel operations were blocking, so the goroutines slowed down to the speed of the for loop iterating over the channel.
You can see the differences here:
package main
import (
"fmt"
"io"
"io/fs"
"log"
"os"
"sort"
"sync"
"github.com/google/fscrypt/filesystem"
"github.com/sirupsen/logrus"
"gopkg.in/alecthomas/kingpin.v2"
)
var (
mountpoint = kingpin.Flag("mount", "The mount to find the largest file usages. Can be a subath of mount").Required().String()
limit = kingpin.Flag("limit", "The maximum number of files return to the display").Default("10").Short('l').Int()
)
var device string
// fileDisplays is the shared result list; the embedded mutex guards Files so
// that several goroutines can append through Append.
type fileDisplays struct {
sync.RWMutex
// Files holds every file recorded so far.
Files []fileDisplay
}
var files fileDisplays
type fileDisplay struct {
Size int64
Path string
}
type bySize []fileDisplay
func (a bySize) Len() int { return len(a) }
func (a bySize) Less(i, j int) bool { return a[i].Size < a[j].Size }
func (a bySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// main parses the flags, walks the requested mount point concurrently, waits
// for all goroutines to finish and prints the *limit largest files collected
// in files.Files.
func main() {
log.SetOutput(io.Discard)
kingpin.Version("0.0.1")
kingpin.Parse()
//Define limit after parsing
logrus.SetLevel(logrus.FatalLevel)
// Make sure the mount point ends with "/", because paths are built by
// plain string concatenation.
// NOTE(review): this slice expression panics if *mountpoint is empty.
if (*mountpoint)[len(*mountpoint)-1:] != "/" {
*mountpoint = *mountpoint + "/"
}
fmt.Println("Finding the top", *limit, "largest files on filesystem", *mountpoint, "\n================================================")
mount, err := filesystem.FindMount(*mountpoint)
if err != nil {
logrus.Fatal(err)
}
// Remember the device so entries on other devices can be skipped.
device = mount.Device
entries, err := os.ReadDir(*mountpoint)
if err != nil {
logrus.Fatal(err)
}
var wg sync.WaitGroup
getFiles(*mountpoint, entries, &wg)
// After Wait returns no goroutine appends any more, so files.Files is read
// below without taking the lock.
wg.Wait()
sort.Sort(bySize(files.Files))
// Ascending order: the largest files are the last *limit elements.
var shortFiles []fileDisplay
if len(files.Files) > *limit {
shortFiles = files.Files[len(files.Files)-*limit:]
} else {
shortFiles = files.Files
}
for _, file := range shortFiles {
fmt.Println(file.Path, file.DisplaySizeIEC())
}
}
func getFiles(start string, entries []fs.DirEntry, wg *sync.WaitGroup) {
for _, entry := range entries {
wg.Add(1)
go handleEntry(start, entry, wg)
}
}
// handleEntry processes one directory entry located at start+entry.Name().
// Entries whose mount device differs from the global device are ignored.
// A regular file is stat'ed and appended to the shared files list; a directory
// is read and its entries are handed to getFiles. Errors are logged and the
// entry is skipped.
func handleEntry(start string, entry fs.DirEntry, wg *sync.WaitGroup) {
defer wg.Done()
var file fileDisplay
mount, err := filesystem.FindMount(start + entry.Name())
if err != nil {
logrus.Errorln(err, start+entry.Name())
return
}
if mount.Device == device {
if entry.Type().IsRegular() {
fileInfo, err := os.Stat(start + entry.Name())
if err != nil {
logrus.Errorln(err, start+entry.Name())
return
}
file.Path = start + entry.Name()
file.Size = fileInfo.Size()
// Append takes the lock, so this is safe from many goroutines.
files.Append(file)
} else if entry.IsDir() {
entries, err := os.ReadDir(start + entry.Name())
if err != nil {
logrus.Errorln(err, start+entry.Name())
return
}
logrus.Info("Searching ", start+entry.Name())
getFiles(start+entry.Name()+"/", entries, wg)
}
}
}
// DisplaySizeIEC renders f.Size in 1024-based units: plain bytes below 1024
// ("0B", "1023B"), otherwise two decimals followed by KiB, MiB, GiB, TiB, PiB
// or EiB.
func (f *fileDisplay) DisplaySizeIEC() string {
	const unit = 1024
	if f.Size < unit {
		return fmt.Sprintf("%dB", f.Size)
	}
	suffixes := "KMGTPE"
	scale, pos := int64(unit), 0
	// Each full factor of 1024 left in the quotient moves one suffix further.
	quotient := f.Size / unit
	for quotient >= unit {
		scale *= unit
		pos++
		quotient /= unit
	}
	return fmt.Sprintf("%.2f%ciB", float64(f.Size)/float64(scale), suffixes[pos])
}
// Append adds item to fd.Files while holding the write lock, so it may be
// called from several goroutines at once.
func (fd *fileDisplays) Append(item fileDisplay) {
	fd.Lock()
	fd.Files = append(fd.Files, item)
	fd.Unlock()
}

Unable to loop through golang dynamic channels

I want to loop through the menu's options. However, it stops at the first option, since a select without "default:" blocks and does not know that more options will appear dynamically.
Below is the broken code:
package main
import (
"bytes"
"fmt"
"io/ioutil"
"log"
"os/exec"
"strings"
"time"
"github.com/getlantern/systray"
"gopkg.in/yaml.v3"
)
var menuItensPtr []*systray.MenuItem
var config map[string]string
var commands []string
func main() {
config = readconfig()
systray.Run(onReady, onExit)
}
// onReady builds the tray menu: one item per config entry (label = key,
// command = value, kept at the same index in menuItensPtr and commands), then
// starts a goroutine that refreshes title and tooltip every second and one
// that waits for clicks.
//
// NOTE(review): the click goroutine receives from one item's ClickedCh at a
// time. A select with a single case and no default blocks until that case can
// proceed, so clicks on any other item are not noticed until the current item
// has been clicked.
func onReady() {
systray.SetIcon(getIcon("assets/menu.ico"))
menuItensPtr = make([]*systray.MenuItem,0)
commands = make([]string,0)
for k, v := range config {
menuItemPtr := systray.AddMenuItem(k, k)
menuItensPtr = append(menuItensPtr, menuItemPtr)
commands = append(commands, v)
}
systray.AddSeparator()
// mQuit := systray.AddMenuItem("Quit", "Quits this app")
go func() {
for {
systray.SetTitle("My tray menu")
systray.SetTooltip("https://github.com/evandrojr/my-tray-menu")
time.Sleep(1 * time.Second)
}
}()
go func() {
for{
for i, menuItenPtr := range menuItensPtr {
select {
/// EXECUTION GETS STUCK HERE!!!!!!!
case<-menuItenPtr.ClickedCh:
execute(commands[i])
}
}
// select {
// case <-mQuit.ClickedCh:
// systray.Quit()
// return
// // default:
// }
}
}()
}
func onExit() {
// Cleaning stuff will go here.
}
// getIcon reads the file named s and returns its bytes. A read error is only
// printed; whatever ReadFile returned alongside it is still handed back.
func getIcon(s string) []byte {
	data, readErr := ioutil.ReadFile(s)
	if readErr == nil {
		return data
	}
	fmt.Print(readErr)
	return data
}
// execute runs the command line in commands: the first whitespace-separated
// word is the program, the remaining words are its arguments. Standard output
// is captured in a buffer. A blank command line or a failing command is logged
// and otherwise ignored, so one bad menu entry does not take the tray
// application down.
func execute(commands string) {
	// Fields (unlike Split on " ") ignores repeated, leading and trailing
	// whitespace and yields no words at all for a blank string.
	args := strings.Fields(commands)
	if len(args) == 0 {
		log.Print("execute: empty command")
		return
	}
	cmd := exec.Command(args[0], args[1:]...)
	var out bytes.Buffer
	cmd.Stdout = &out
	if err := cmd.Run(); err != nil {
		log.Printf("execute %q: %v", commands, err)
		return
	}
	// fmt.Printf("Output %s\n", out.String())
}
// readconfig reads my-tray-menu.yaml from the current working directory,
// unmarshals it into a map of menu label -> command line, prints every entry
// and returns the map. A read or parse error is passed to log.Fatal.
func readconfig() map[string]string{
yfile, err := ioutil.ReadFile("my-tray-menu.yaml")
if err != nil {
log.Fatal(err)
}
data := make(map[string]string)
err2 := yaml.Unmarshal(yfile, &data)
if err2 != nil {
log.Fatal(err2)
}
// Echo the configuration that was loaded.
for k, v := range data {
fmt.Printf("%s -> %s\n", k, v)
}
return data
}
Below is the ugly workaround that works:
package main
import (
"bytes"
"fmt"
"io/ioutil"
"log"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"github.com/getlantern/systray"
"gopkg.in/yaml.v3"
)
var menuItensPtr []*systray.MenuItem
var config map[string]string
var commands []string
var labels []string
var programPath string
func main() {
setProgramPath()
config = readconfig()
time.Sleep(1 * time.Second)
systray.Run(onReady, onExit)
}
// onReady builds a tray menu with exactly four command items plus "Quit",
// starts a goroutine that refreshes title and tooltip every second, and a
// second goroutine that handles clicks on all five items in one select.
//
// NOTE(review): the four items index labels and commands at 0..3, so this
// panics when the configuration has fewer than four entries; entries beyond
// the fourth get no menu item.
func onReady() {
systray.SetIcon(getIcon(filepath.Join(programPath,"assets/menu.ico")))
menuItensPtr = make([]*systray.MenuItem, 0)
i := 0
op0 := systray.AddMenuItem(labels[i], commands[i])
i++
op1 := systray.AddMenuItem(labels[i], commands[i])
i++
op2 := systray.AddMenuItem(labels[i], commands[i])
i++
op3 := systray.AddMenuItem(labels[i], commands[i])
i++
systray.AddSeparator()
mQuit := systray.AddMenuItem("Quit", "Quits this app")
go func() {
for {
systray.SetTitle("My tray menu")
systray.SetTooltip("https://github.com/evandrojr/my-tray-menu")
time.Sleep(1 * time.Second)
}
}()
go func() {
for {
// One select over every channel: it blocks until any of the items is
// clicked, then handles exactly that one.
select {
// HERE DOES NOT GET STUCK!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
case <-op0.ClickedCh:
execute(commands[0])
case <-op1.ClickedCh:
execute(commands[1])
case <-op2.ClickedCh:
execute(commands[2])
case <-op3.ClickedCh:
execute(commands[3])
case <-mQuit.ClickedCh:
systray.Quit()
return
}
}
}()
}
func onExit() {
// Cleaning stuff will go here.
}
// getIcon returns the contents of the file named s. If reading fails the
// error is printed and the (possibly nil) data from ReadFile is returned.
func getIcon(s string) []byte {
	icon, err := ioutil.ReadFile(s)
	switch {
	case err != nil:
		fmt.Print(err)
	}
	return icon
}
// setProgramPath stores the directory that contains the running executable in
// the package-level programPath. It panics if the executable path cannot be
// determined.
func setProgramPath() {
	ex, err := os.Executable()
	if err != nil {
		panic(err)
	}
	// err is known to be nil from here on, so no second check is needed.
	programPath = filepath.Dir(ex)
}
// execute runs the command line in commands: the first whitespace-separated
// word is the program, the remaining words are its arguments. On success the
// captured standard output is printed. A blank command line or a failing
// command is logged and otherwise ignored, so one bad menu entry does not take
// the tray application down.
func execute(commands string) {
	// Fields (unlike Split on " ") ignores repeated, leading and trailing
	// whitespace and yields no words at all for a blank string.
	args := strings.Fields(commands)
	if len(args) == 0 {
		log.Print("execute: empty command")
		return
	}
	cmd := exec.Command(args[0], args[1:]...)
	var out bytes.Buffer
	cmd.Stdout = &out
	if err := cmd.Run(); err != nil {
		log.Printf("execute %q: %v", commands, err)
		return
	}
	fmt.Printf("Output %s\n", out.String())
}
// readconfig reads my-tray-menu.yaml from programPath, unmarshals it into a
// map of menu label -> command line and returns the map. As a side effect it
// rebuilds the package-level labels and commands slices so that labels[i] and
// commands[i] belong to the same entry, and prints every entry followed by the
// number of labels. A read or parse error is passed to log.Fatal.
func readconfig() map[string]string {
yfile, err := ioutil.ReadFile(filepath.Join(programPath,"my-tray-menu.yaml"))
if err != nil {
log.Fatal(err)
}
data := make(map[string]string)
err2 := yaml.Unmarshal(yfile, &data)
if err2 != nil {
log.Fatal(err2)
}
labels = make([]string, 0)
commands = make([]string, 0)
// Both slices are appended to in the same iteration, so their indices line
// up even though map iteration order is not fixed.
for k, v := range data {
labels = append(labels, k)
commands = append(commands, v)
fmt.Printf("%s -> %s\n", k, v)
}
fmt.Print(len(labels))
return data
}
Full source code here:
https://github.com/evandrojr/my-tray-menu
select "chooses which of a set of possible send or receive operations will proceed". The spec sets out how this choice is made:
If one or more of the communications can proceed, a single one that can proceed is chosen via a uniform pseudo-random selection. Otherwise, if there is a default case, that case is chosen. If there is no default case, the "select" statement blocks until at least one of the communications can proceed.
Your working example:
select {
case <-op0.ClickedCh:
execute(commands[0])
case <-op1.ClickedCh:
execute(commands[1])
// ...
}
uses select successfully to choose between one of the offered options. However if you pass a single option e.g.
select {
case <-menuItenPtr.ClickedCh:
	execute(commands[i])
}
The select will block until <-menuItenPtr.ClickedCh is ready to proceed (e.g. something is received). This is effectively the same as not using a select:
<-menuItenPtr.ClickedCh
execute(commands[i])
The result you were expecting can be achieved by providing a default option:
select {
case <-menuItenPtr.ClickedCh:
	execute(commands[i])
default:
	// Nothing was clicked: fall through without blocking.
}
As per the quote from the spec above the default option will be chosen if none of the other options can proceed. While this may work it's not a very good solution because you effectively end up with:
for {
// Check if event happened (not blocking)
}
This will tie up CPU time unnecessarily as it continually loops checking for events. A better solution would be to start a goroutine to monitor each channel:
for i, menuItenPtr := range menuItensPtr {
go func(c chan struct{}, cmd string) {
for range c { execute(cmd) }
}(menuItenPtr.ClickedCh, commands[i])
}
// Start another goroutine to handle quit
The above will probably work but does lead to the possibility that execute will be called concurrently (which might cause issues if your code is not thread-safe). One way around this is to use the "fan in" pattern (as suggested by @kostix and in the Rob Pike video suggested by @John); something like:
// cmdChan funnels the command of whichever item was clicked into one place;
// it carries the command strings themselves, hence chan string.
cmdChan := make(chan string)
for i, menuItenPtr := range menuItensPtr {
	// One forwarding goroutine per menu item (fan in).
	go func(c chan struct{}, cmd string) {
		for range c {
			cmdChan <- cmd
		}
	}(menuItenPtr.ClickedCh, commands[i])
}
go func() {
	// A single consumer, so execute is never called concurrently.
	for {
		select {
		case cmd := <-cmdChan:
			execute(cmd) // Handle command
		case <-mQuit.ClickedCh:
			systray.Quit()
			return
		}
	}
}()
note: all code above entered directly into the question so please treat as pseudo code!

Is it a better way to do parallel programming that this?

I made this script for getting the follower count of "influencers" from instagram
the "runtime" number I am getting from it is between 550-750ms.
It is not that bad, but I am wondering whether it could be better or not (as I am a golang noob - learning it 3 weeks only)
package main
import (
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"sync"
"time"
)
// user is the top-level JSON object of the decoded response; only the "user"
// member is mapped.
type user struct {
User userData `json:"user"`
}
// userData maps the "followed_by" member of the user object.
type userData struct {
Followers count `json:"followed_by"`
}
// count maps the "count" member, the follower number that is finally reported.
type count struct {
Count int `json:"count"`
}
// getFollowerCount starts one worker goroutine that reads usernames from in,
// looks up the follower count of each and sends it on the returned channel.
// Exactly one value is sent per username; a lookup that fails is printed and
// reported as 0. The returned channel is closed once in is drained.
func getFollowerCount(in <-chan string) <-chan int {
	out := make(chan int)
	go func() {
		defer close(out)
		for un := range in {
			out <- fetchFollowerCount(un)
		}
	}()
	return out
}

// fetchFollowerCount performs the HTTP request for a single username. Living
// in its own function means the response body is closed after every request
// rather than when the whole worker ends.
func fetchFollowerCount(un string) int {
	URL := "https://www.instagram.com/" + un + "/?__a=1"
	resp, err := http.Get(URL)
	if err != nil {
		// resp is nil on error, so it must not be touched.
		fmt.Println(err)
		return 0
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println(err)
		return 0
	}
	var u user
	if err := json.Unmarshal(body, &u); err != nil {
		fmt.Println(err)
	}
	return u.User.Followers.Count
}
// merge fans the given channels into one. Every value received from any input
// is forwarded to the returned channel, which is closed after all inputs have
// been closed and drained. The relative order of values from different inputs
// is not defined.
func merge(cs ...<-chan int) <-chan int {
	out := make(chan int)
	var wg sync.WaitGroup
	wg.Add(len(cs))
	for _, c := range cs {
		// One forwarding goroutine per input channel.
		go func(src <-chan int) {
			for n := range src {
				out <- n
			}
			wg.Done()
		}(c)
	}
	// Close out only after every forwarder has finished.
	go func() {
		wg.Wait()
		close(out)
	}()
	return out
}
// gen returns an unbuffered channel that delivers the given usernames in
// order and is closed after the last one has been received.
func gen(users ...string) <-chan string {
	out := make(chan string)
	go func() {
		for i := 0; i < len(users); i++ {
			out <- users[i]
		}
		close(out)
	}()
	return out
}
// main feeds the usernames through ten getFollowerCount stages that all read
// from the same input channel, prints every count from the merged output and
// finally logs the elapsed time.
func main() {
	start := time.Now()
	fmt.Println("STARTING UP")
	usrs := []string{"kanywest", "kimkardashian", "groovyq", "kendricklamar", "barackobama", "asaprocky", "champagnepapi", "eminem", "drdre", "g_eazy", "skrillex"}
	in := gen(usrs...)

	// Ten stages share the single input channel.
	const fanOut = 10
	stages := make([]<-chan int, 0, fanOut)
	for len(stages) < fanOut {
		stages = append(stages, getFollowerCount(in))
	}
	for d := range merge(stages...) {
		fmt.Println(d)
	}
	elapsed := time.Since(start)
	log.Println("runtime", elapsed)
}
I agree with jeevatkm: there are numerous ways to implement your task and improve it. Some notes:
(1) Separate the function that actually does the job (i.e. fetches the result from the remote service) from the function that is responsible for coordinating all the jobs.
(2) It is good practice to propagate an error to the caller instead of consuming (handling) it inside the called function.
(3) Since the jobs are done in parallel, the results may be returned in an undetermined order. Thus, besides the follower count, each result should contain other related information (such as the username).
The following implementation may be one alternative:
package main
import (
"encoding/json"
"errors"
"fmt"
"net/http"
"sync"
"time"
)
type user struct {
User userData `json:"user"`
}
type userData struct {
Followers count `json:"followed_by"`
}
type count struct {
Count int `json:"count"`
}
// follower wraps username, count, and error for one lookup, so that a result
// can be matched to its user even when results arrive out of order. See (3)
// above.
type follower struct {
// Username is the name the lookup was made for.
Username string
// Count is the follower count, or -1 when Error is set.
Count int
// Error is the lookup error, nil on success.
Error error
}
// GetFollowerCountFunc is a function for fetching the follower count of a
// specific user. It takes the username and returns the count or an error.
type GetFollowerCountFunc func(string) (int, error)
// mockGetFollowerCountFor is a stand-in for tests: usernames shorter than nine
// bytes produce an error (with count -1), all others a count of 10.
func mockGetFollowerCountFor(userName string) (int, error) {
	const minLen = 9
	if len(userName) >= minLen {
		return 10, nil
	}
	return -1, errors.New("mocking error in get follower count")
}
// getFollowerCountFor fetches the follower count of userName from the remote
// service. It returns -1 together with the error when the request or the JSON
// decoding fails. See (1) above.
func getFollowerCountFor(userName string) (int, error) {
	resp, err := http.Get("https://www.instagram.com/" + userName + "/?__a=1")
	if err != nil {
		return -1, err
	}
	defer resp.Body.Close()

	// Decode straight from the response stream.
	var profile user
	decodeErr := json.NewDecoder(resp.Body).Decode(&profile)
	if decodeErr != nil {
		return -1, decodeErr
	}
	return profile.User.Followers.Count, nil
}
// getFollowersAsync coordinates the lookups: it starts one goroutine per user,
// each calling fn, and returns at once with the channel on which the results
// arrive in completion order. The channel is buffered for len(users) results,
// so no worker ever blocks on sending, and it is closed after the last worker
// has finished. A failed lookup is reported with Count -1 and the error set.
// See (1), (2) above.
func getFollowersAsync(users []string, fn GetFollowerCountFunc) <-chan follower {
	followers := make(chan follower, len(users))

	// Distribute the jobs from a separate goroutine so the caller can start
	// consuming immediately.
	go func() {
		// Closing lets the caller's `for ... range` end gracefully.
		defer close(followers)

		var wg sync.WaitGroup
		for _, u := range users {
			wg.Add(1)
			go func(uid string) {
				defer wg.Done()
				res := follower{Username: uid}
				if res.Count, res.Error = fn(uid); res.Error != nil {
					res.Count = -1
				}
				followers <- res
			}(u)
		}
		// Wait until all workers have finished.
		wg.Wait()
	}()
	return followers
}
// main looks up the follower count of every listed user in parallel, prints
// one line per result (count or error) in the order the results arrive, and
// finally prints the elapsed time.
func main() {
	start := time.Now()
	fmt.Println("STARTING UP")
	usrs := []string{"kanywest", "kimkardashian", "groovyq", "kendricklamar", "barackobama", "asaprocky", "champagnepapi", "eminem", "drdre", "g_eazy", "skrillex"}
	results := getFollowersAsync(usrs, getFollowerCountFor)
	//For TESTING:
	//results := getFollowersAsync(usrs, mockGetFollowerCountFor)
	for r := range results {
		if r.Error != nil {
			// Terminate the line so consecutive errors do not run together.
			fmt.Printf("Error for user '%s' => %v\n", r.Username, r.Error)
		} else {
			fmt.Printf("%s: %d\n", r.Username, r.Count)
		}
	}
	elapsed := time.Since(start)
	fmt.Println("runtime", elapsed)
}
Welcome to Go, happy learning.
You're doing well; you can improve your program in many ways (such as using a JSON decoder, fewer channels, etc.). The following is one approach. Execution time is between 352-446ms (take it with a grain of salt, since a network call is involved in your code; it may vary with server response time).
Your updated code:
package main
import (
"encoding/json"
"fmt"
"log"
"net/http"
"sync"
"time"
)
type user struct {
User userData `json:"user"`
}
type userData struct {
Followers count `json:"followed_by"`
}
type count struct {
Count int `json:"count"`
}
// getFollowerCount fetches the follower count of username and sends it on
// result. If the request or the JSON decoding fails, the error is logged and
// nothing is sent. wg.Done is called on every path.
func getFollowerCount(username string, result chan<- int, wg *sync.WaitGroup) {
	defer wg.Done()

	resp, err := http.Get("https://www.instagram.com/" + username + "/?__a=1")
	if err != nil {
		log.Println(err)
		return
	}
	defer resp.Body.Close()

	var profile user
	decodeErr := json.NewDecoder(resp.Body).Decode(&profile)
	if decodeErr != nil {
		log.Println(decodeErr)
		return
	}
	result <- profile.User.Followers.Count
}
// execute starts one getFollowerCount goroutine per user, waits for all of
// them to finish and then sends -1 on result as the end-of-results marker.
func execute(users []string, result chan<- int) {
	var pending sync.WaitGroup
	pending.Add(len(users))
	for i := range users {
		go getFollowerCount(users[i], result, &pending)
	}
	pending.Wait()
	// Sentinel telling the receiver that no more counts will follow.
	result <- -1
}
// main runs execute in the background, prints every follower count received
// on result and stops at the -1 end marker that execute sends last; finally it
// prints the elapsed time.
func main() {
start := time.Now()
fmt.Println("STARTING UP")
usrs := []string{"kanywest", "kimkardashian", "groovyq", "kendricklamar", "barackobama", "asaprocky", "champagnepapi", "eminem", "drdre", "g_eazy", "skrillex"}
result := make(chan int)
go execute(usrs, result)
for v := range result {
// result is never closed; the -1 sentinel is what ends this loop.
if v == -1 {
break
}
fmt.Println(v)
}
elapsed := time.Since(start)
fmt.Println("runtime:", elapsed)
}

How to close a channel

I try to adapt this example:
https://gobyexample.com/worker-pools
But I don't know how to stop the channel, because the program doesn't exit at the end of the channel loop.
Can you explain how to exit the program?
package main
import (
"github.com/SlyMarbo/rss"
"bufio"
"fmt"
"log"
"os"
)
// readLines returns the lines of the file at path without their line
// terminators. If the file cannot be opened it returns nil and the error; an
// error hit while scanning is returned together with the lines read so far.
func readLines(path string) ([]string, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	var (
		lines []string
		sc    = bufio.NewScanner(f)
	)
	for sc.Scan() {
		lines = append(lines, sc.Text())
	}
	return lines, sc.Err()
}
// worker fetches every feed URL received on jobs and sends the links of at
// most the first five items of each feed on results. A feed that cannot be
// fetched is reported on standard output and skipped. The function returns
// when jobs is closed and drained.
func worker(id int, jobs <-chan string, results chan<- string) {
	const maxLinks = 5
	for url := range jobs {
		fmt.Println("worker", id, "processing job", url)
		feed, err := rss.Fetch(url)
		if err != nil {
			fmt.Println("Error on: ", url)
			continue
		}
		for sent, item := range feed.Items {
			if sent >= maxLinks {
				break
			}
			results <- item.Link
		}
	}
}
// main starts 16 workers, feeds them the feed URLs read from flux.txt and
// prints every link the workers send on results.
//
// NOTE(review): results is never closed, so the final range loop cannot end.
// In addition both channels are unbuffered and nothing receives from results
// until every URL has been handed out, so a worker that is blocked sending a
// link cannot pick up another job.
func main() {
jobs := make(chan string)
results := make(chan string)
for w := 1; w <= 16; w++ {
go worker(w, jobs, results)
}
urls, err := readLines("flux.txt")
if err != nil {
log.Fatalf("readLines: %s", err)
}
for _, url := range urls {
jobs <- url
}
// Closing jobs lets the workers' range loops end once all URLs are taken.
close(jobs)
// it seems program runs over...
for msg := range results {
fmt.Println(msg)
}
}
The flux.txt is a flat text file like :
http://blog.case.edu/news/feed.atom
...
The problem is that, in the example you are referring to, the worker pool reads from results 9 times:
for a := 1; a <= 9; a++ {
<-results
}
Your program, on the other hand, does a range loop over the results which has a different semantics in go. The range operator does not stop until the channel is closed.
for msg := range results {
fmt.Println(msg)
}
To fix your problem you'd need to close the results channel. However, if you just call close(results) before the for loop, you will most probably
get a panic, because the workers might still be writing to results.
To fix this problem, you need a way to be notified when all the workers are done. You can do this using either a sync.WaitGroup:
const (
	// workers is the number of worker goroutines.
	workers = 16
)

// main starts the workers, feeds them the URLs from flux.txt and prints every
// result. results is closed by a helper goroutine once all workers are done,
// which ends the printing loop.
func main() {
	jobs := make(chan string, 100)
	results := make(chan string, 100)
	var wg sync.WaitGroup
	for w := 0; w < workers; w++ {
		// Add must happen before the goroutine starts; inside it, Wait could
		// run first and see a zero counter.
		wg.Add(1)
		// Pass w as an argument so each worker gets its own id.
		go func(id int) {
			defer wg.Done()
			worker(id, jobs, results)
		}(w)
	}
	urls, err := readLines("flux.txt")
	if err != nil {
		log.Fatalf("readLines: %s", err)
	}
	// Feed the jobs in the background so main can start draining results
	// right away, whatever the number of URLs.
	go func() {
		for _, url := range urls {
			jobs <- url
		}
		close(jobs)
	}()
	// Wait in the background too: waiting here before reading results would
	// deadlock as soon as the results buffer is full.
	go func() {
		wg.Wait()
		close(results)
	}()
	for msg := range results {
		fmt.Println(msg)
	}
}
Or a done channel:
package main
import (
"bufio"
"fmt"
"github.com/SlyMarbo/rss"
"log"
"os"
)
// readLines opens the file at path and returns its lines, line terminators
// removed. An open failure yields nil and the error; a scanning error is
// returned along with the lines collected up to that point.
func readLines(path string) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var collected []string
	scanner := bufio.NewScanner(file)
	for {
		if !scanner.Scan() {
			break
		}
		collected = append(collected, scanner.Text())
	}
	return collected, scanner.Err()
}
// worker fetches every feed URL received on jobs and sends the links of at
// most the first five items of each feed on results. A feed that cannot be
// fetched is reported on standard output and skipped. Once jobs is closed and
// drained, the worker closes done to signal that it has finished.
func worker(id int, jobs <-chan string, results chan<- string, done chan struct{}) {
	const maxLinks = 5
	for url := range jobs {
		fmt.Println("worker", id, "processing job", url)
		feed, err := rss.Fetch(url)
		if err != nil {
			fmt.Println("Error on: ", url)
			continue
		}
		for sent, item := range feed.Items {
			if sent >= maxLinks {
				break
			}
			results <- item.Link
		}
	}
	close(done)
}
const (
	// workers is the number of worker goroutines.
	workers = 16
)

// main starts the workers, feeds them the URLs from flux.txt and prints every
// result. Each worker closes its own done channel when finished; a helper
// goroutine waits for all of them and then closes results, which ends the
// printing loop.
func main() {
	jobs := make(chan string, 100)
	results := make(chan string, 100)
	dones := make([]chan struct{}, workers)
	for w := 0; w < workers; w++ {
		dones[w] = make(chan struct{})
		go worker(w, jobs, results, dones[w])
	}
	urls, err := readLines("flux.txt")
	if err != nil {
		log.Fatalf("readLines: %s", err)
	}
	// Feed the jobs in the background so main can start draining results
	// right away, whatever the number of URLs.
	go func() {
		for _, url := range urls {
			jobs <- url
		}
		close(jobs)
	}()
	// Wait in the background too: waiting here before reading results would
	// deadlock as soon as the results buffer is full.
	go func() {
		for _, done := range dones {
			<-done
		}
		close(results)
	}()
	for msg := range results {
		fmt.Println(msg)
	}
}

inconsistent results using golang channels

I have a task written in Go that gets a unique list from a bunch of text files. I added some parallelization using channels and am now getting inconsistent results - a variance of 5 records output/not output each time with the same input files.
I am testing it with go run process.go | wc -l on Fedora x86_64, go1.1.2, 8 core amd.
The code is:
package main
import (
"fmt"
"os"
"io"
"encoding/csv"
"regexp"
"log"
)
var (
cleanRe *regexp.Regexp = regexp.MustCompile("[^0-9]+")
comma rune ='\t'
fieldsPerRecord=-1
)
// clean removes every character matched by cleanRe from s and returns the
// remainder, or "" when fewer than six characters are left.
func clean(s string) string {
	digits := cleanRe.ReplaceAllLiteralString(s, "")
	if len(digits) >= 6 {
		return digits
	}
	return ""
}
// uniqueChannel consumes [id, value] records from inputChan until it is
// closed and prints each id/value pair it has not recorded before. On return
// it logs how many records it consumed and reports completion by sending
// "Input digester." on controlChan.
//
// NOTE(review): the first record seen for an id only creates the inner map.
// Because the bookkeeping sits in the else-if branch, that record's value is
// neither stored nor printed, so which pairs appear depends on the order in
// which records arrive.
func uniqueChannel(inputChan chan []string, controlChan chan string) {
defer func(){controlChan<-"Input digester."}()
// uniq[id][value] is true once the pair has been printed.
uniq:=make(map[string]map[string]bool)
i:=0
for record:= range inputChan {
i++
id,v:=record[0],record[1]
if uniq[id]==nil {
uniq[id]=make(map[string]bool)
} else if !uniq[id][v] {
uniq[id][v]=true
fmt.Println(id,string(comma),v)
}
}
log.Println("digest ", i)
}
// processFile reads fileName as delimited text (delimiter comma, variable
// number of fields per record) and, for every field after the first whose
// cleaned form is non-empty, sends [first field, cleaned value] on outputChan.
// Records that fail to parse are skipped. On return it logs the number of
// values sent and reports completion by sending fileName on controlChan.
// NOTE(review): the opened file is never closed in this function.
func processFile(fileName string, outputChan chan []string, controlChan chan string) {
defer func(){controlChan<-fileName}()
f,err:=os.Open(fileName)
if err!=nil{log.Fatal(err)}
r:=csv.NewReader(f)
r.FieldsPerRecord = fieldsPerRecord
r.Comma = comma
// Process the records
i:=0
for record,err:=r.Read();err!=io.EOF;record,err=r.Read() {
// Any read error other than end of file: skip this record.
if err!=nil{continue}
id:=record[0]
for _,v:=range record[1:] {
if cleanV:=clean(v);cleanV!=""{
i++
outputChan<-[]string{id,cleanV}
}
}
}
log.Println(fileName,i)
}
// main starts one processFile goroutine per input file and one uniqueChannel
// goroutine, then waits for a completion message from each of them on
// controlChan. When only one message is still outstanding it closes
// recordChan, which is what lets uniqueChannel finish.
// NOTE(review): inputs is an empty list as shown here.
func main() {
inputs:=[]string{}
recordChan:=make(chan []string,100)
// One completion message per file plus one for the digester.
processesLeft:=len(inputs)+1
controlChan:=make(chan string,processesLeft)
// Ingest the inputs
for _,fName:=range inputs {
go processFile(fName,recordChan,controlChan)
}
// This is the loop to ensure it's all unique
go uniqueChannel(recordChan,controlChan)
// Make sure all the channels close up
for processesLeft>0 {
if processesLeft==1{
close(recordChan)
}
c:=<-controlChan
log.Println(c)
processesLeft--
}
close(controlChan)
}
It seems like it closes the channel before it's empty and quits. Without the closing mechanism I was getting deadlocks - I'm out of ideas.
You could ditch the control channel and use a sync.WaitGroup:
package main
import (
"encoding/csv"
"fmt"
"io"
"log"
"os"
"regexp"
"sync"
)
var (
cleanRe *regexp.Regexp = regexp.MustCompile("[^0-9]+")
comma rune = '\t'
fieldsPerRecord = -1
)
// clean strips everything cleanRe matches from s. The stripped string is
// returned only if at least six characters remain; otherwise the result is "".
func clean(s string) string {
	const minLen = 6
	stripped := cleanRe.ReplaceAllLiteralString(s, "")
	if len(stripped) >= minLen {
		return stripped
	}
	return ""
}
// uniqueChannel consumes [id, value] records from inputChan until the channel
// is closed and prints every distinct id/value pair exactly once, in the order
// of first arrival. When the input is exhausted it logs the number of records
// consumed.
func uniqueChannel(inputChan chan []string) {
	// uniq[id][value] is true once the pair has been printed.
	uniq := make(map[string]map[string]bool)
	i := 0
	for record := range inputChan {
		i++
		id, v := record[0], record[1]
		seen := uniq[id]
		if seen == nil {
			seen = make(map[string]bool)
			uniq[id] = seen
		}
		// Checked for every record, including the first one of an id; an
		// else-if here would silently drop that first value.
		if seen[v] {
			continue
		}
		seen[v] = true
		fmt.Println(id, string(comma), v)
	}
	log.Println("digest ", i)
}
// processFile reads fileName as delimited text (delimiter comma, variable
// number of fields per record) and, for every field after the first whose
// cleaned form is non-empty, sends [first field, cleaned value] on outputChan.
// Records that fail to parse are skipped. The program is terminated if the
// file cannot be opened.
func processFile(fileName string, outputChan chan []string) {
	f, err := os.Open(fileName)
	if err != nil {
		log.Fatal(err)
	}
	// Release the file descriptor when this file is done.
	defer f.Close()

	r := csv.NewReader(f)
	r.FieldsPerRecord = fieldsPerRecord
	r.Comma = comma
	// Process the records
	for record, err := r.Read(); err != io.EOF; record, err = r.Read() {
		if err != nil {
			// Malformed record: skip it and keep reading.
			continue
		}
		id := record[0]
		for _, v := range record[1:] {
			if cleanV := clean(v); cleanV != "" {
				outputChan <- []string{id, cleanV}
			}
		}
	}
}
// main processes every input file in its own goroutine, closes recordChan
// once all of them have finished, and runs the de-duplicating consumer in the
// main goroutine until the channel is drained.
func main() {
	inputs := []string{"ex.tsv"}
	recordChan := make(chan []string)
	var wg sync.WaitGroup
	// Ingest the inputs
	for _, fName := range inputs {
		wg.Add(1)
		// Pass the name as an argument so every goroutine works on its own
		// file rather than on whatever the loop variable holds later.
		go func(name string) {
			defer wg.Done()
			processFile(name, recordChan)
		}(fName)
	}
	// Only the producer side closes the channel, and only after the last
	// producer has finished.
	go func() {
		wg.Wait()
		close(recordChan)
	}()
	// This is the loop to ensure it's all unique
	uniqueChannel(recordChan)
}

Resources