My understanding is that an unbuffered channel (jobs) needs both a sender and a receiver, which I have, but something about the order of my code must be wrong: it is taking a very long time to complete.
What am I missing here?
const numWorkers = 5
type workerJob struct {
FirstID string
SecondID string
}
func worker(ctx *gin.Context, fs *firestore.Client, jobs <-chan *workerJob, done chan<- bool) {
for job := range jobs {
firstID := job.FirstID
secondID := job.SecondID
if err := saveUpdate(ctx, firstID, secondID, fs); err != nil {
// handle error
}
}
done <- true
}
func UpdateSomething(ctx *gin.Context) {
fs, err := firestore.NewClient(ctx, "some_ID")
if err != nil {
// handle error
}
defer fs.Close()
docsnaps, err := fs.CollectionGroup("someCollection").Where("someCondition", "==", true).Documents(ctx).GetAll()
if err != nil {
// handle error
}
uniqueSomethings := make(map[string]struct{})
jobs := make(chan *workerJob)
done := make(chan bool, numWorkers)
for w := 1; w <= numWorkers; w++ {
go worker(ctx, fs, jobs, done)
}
for _, docsnap := range docsnaps {
var someType SomeType
err := docsnap.DataTo(&someType)
if err != nil {
// handle error
}
for _, prop := range someType.prop {
if strings.Contains(prop.Name, "someString") {
someID := prop.Name[strings.LastIndex(prop.Name, ":")+1:]
if _, ok := uniqueSomethings[someID]; !ok {
uniqueSomethings[someID] = struct{}{}
job := &workerJob{
FirstID: dashboard.CustomerID,
SecondID: someID[strings.Index(someID, "_")+1:],
}
jobs <- job
}
}
}
}
close(jobs)
for i := 1; i <= numWorkers; i++ {
<-done
}
return
}
I don't fully understand why, but I have been able to bring the time down by a factor of 6! I created a slice of jobs, then made a channel whose capacity is the length of that slice, and looped over the slice to send the jobs into the channel. Here's how it looks:
func UpdateSomething(ctx *gin.Context) {
fs, err := firestore.NewClient(ctx, "some_ID")
if err != nil {
// handle error
}
defer fs.Close()
docsnaps, err := fs.CollectionGroup("someCollection").Where("someCondition", "==", true).Documents(ctx).GetAll()
if err != nil {
// handle error
}
uniqueSomethings := make(map[string]struct{})
jobsArr := make([]*workerJob, 0)
for _, docsnap := range docsnaps {
var someType SomeType
err := docsnap.DataTo(&someType)
if err != nil {
// handle error
}
for _, prop := range someType.prop {
if strings.Contains(prop.Name, "someString") {
someID := prop.Name[strings.LastIndex(prop.Name, ":")+1:]
if _, ok := uniqueSomethings[someID]; !ok {
uniqueSomethings[someID] = struct{}{}
job := &workerJob{
FirstID: dashboard.CustomerID,
SecondID: someID[strings.Index(someID, "_")+1:],
}
jobsArr = append(jobsArr, job)
}
}
}
}
done := make(chan bool, numWorkers)
jobs := make(chan *workerJob, len(jobsArr))
for w := 1; w <= numWorkers; w++ {
go worker(ctx, fs, jobs, done)
}
for _, job := range jobsArr {
jobs <- job
}
close(jobs)
for i := 1; i <= numWorkers; i++ {
<-done
}
return
}
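A possible contributor to the speedup (a hedged guess from channel semantics, not something verified against this code): with an unbuffered channel, each jobs <- job blocks until a worker is ready to receive, so the producing loop runs in lock-step with the slow saveUpdate calls, whereas the buffered version queues every job up front; with only five workers the steady-state throughput should be similar either way, so it is worth profiling before crediting the buffer alone. Independent of buffering, the done-channel accounting can be replaced with a sync.WaitGroup. A minimal sketch, assuming the same jobs channel, jobsArr, and worker body as above:

var wg sync.WaitGroup
for w := 1; w <= numWorkers; w++ {
	wg.Add(1)
	go func() {
		defer wg.Done()
		for job := range jobs {
			if err := saveUpdate(ctx, job.FirstID, job.SecondID, fs); err != nil {
				// handle error
			}
		}
	}()
}
for _, job := range jobsArr {
	jobs <- job
}
close(jobs)
wg.Wait() // returns once every worker has drained the channel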
The following snippet validates a phone number and writes the details to a CSV.
func Parse(phone Input, output *PhoneNumber) error {
var n PhoneNumber
num, _ := phonenumbers.Parse(phone.Number, phone.Prefix)
n.PhoneNumber = phonenumbers.Format(num, phonenumbers.E164)
n.CountryCode = num.GetCountryCode()
n.PhoneType = phonenumbers.GetNumberType(num)
n.NetworkName, _ = phonenumbers.GetCarrierForNumber(num, "EN")
n.Region = phonenumbers.GetRegionCodeForNumber(num)
*output = n
return nil
}
func createFile(path string) {
// detect if file exists
var _, err = os.Stat(path)
// create file if not exists
if os.IsNotExist(err) {
var file, err = os.Create(path)
if err != nil {
return
}
defer file.Close()
}
}
func worker(ctx context.Context, dst chan string, src chan []string) {
for {
select {
case dataArray, ok := <-src: // you must check for readable state of the channel.
if !ok {
return
}
go processNumber(dataArray[0])
case <-ctx.Done(): // if the context is cancelled, quit.
return
}
}
}
func processNumber(number string) {
num, e := phonenumbers.Parse(number, "")
if e != nil {
return
}
region := phonenumbers.GetRegionCodeForNumber(num)
carrier, _ := phonenumbers.GetCarrierForNumber(num, "EN")
path := "sample_all.csv"
createFile(path)
var csvFile, _ = os.OpenFile(path, os.O_APPEND|os.O_WRONLY, os.ModeAppend)
csvwriter := csv.NewWriter(csvFile)
_ = csvwriter.Write([]string{phonenumbers.Format(num, phonenumbers.E164), fmt.Sprintf("%v", num.GetCountryCode()), fmt.Sprintf("%v", phonenumbers.GetNumberType(num)), carrier, region})
defer csvFile.Close()
csvwriter.Flush()
}
func ParseFile(phone Input, output *PhoneNumber) error {
// create a context
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// that cancels at ctrl+C
go onSignal(os.Interrupt, cancel)
numberOfWorkers := 2
start := time.Now()
csvfile, err := os.Open(phone.File)
if err != nil {
log.Fatal(err)
}
defer csvfile.Close()
reader := csv.NewReader(csvfile)
// create the pair of input/output channels for the controller=>workers com.
src := make(chan []string)
out := make(chan string)
// use a waitgroup to manage synchronization
var wg sync.WaitGroup
// declare the workers
for i := 0; i < numberOfWorkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
worker(ctx, out, src)
}()
}
// read the csv and write it to src
go func() {
for {
record, err := reader.Read()
if err == io.EOF {
break
} else if err != nil {
log.Fatal(err)
}
src <- record // you might select on ctx.Done().
}
close(src) // close src to signal workers that no more jobs are incoming.
}()
// wait for worker group to finish and close out
go func() {
wg.Wait() // wait for writers to quit.
close(out) // when you close(out) it breaks the below loop.
}()
// drain the output
for res := range out {
fmt.Println(res)
}
fmt.Printf("\n%2fs", time.Since(start).Seconds())
return nil
}
In the processNumber function, if I skip writing to the CSV, verifying the numbers completes in 6 seconds, but writing one record at a time to the CSV stretches the time to 15s.
How can I optimize the code?
Can I chunk the records and write them in chunks instead of writing one row at a time?
Do the work directly in the worker goroutine instead of firing off a goroutine per task.
Open the output file once. Flush the output file once.
func worker(ctx context.Context, dst chan []string, src chan []string) {
for {
select {
case dataArray, ok := <-src: // you must check for readable state of the channel.
if !ok {
return
}
dst <- processNumber(dataArray[0])
case <-ctx.Done(): // if the context is cancelled, quit.
return
}
}
}
func processNumber(number string) []string {
num, e := phonenumbers.Parse(number, "")
if e != nil {
return nil
}
region := phonenumbers.GetRegionCodeForNumber(num)
carrier, _ := phonenumbers.GetCarrierForNumber(num, "EN")
return []string{phonenumbers.Format(num, phonenumbers.E164), fmt.Sprintf("%v", num.GetCountryCode()), fmt.Sprintf("%v", phonenumbers.GetNumberType(num)), carrier, region}
}
func ParseFile(phone Input, output *PhoneNumber) error {
// create a context
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// that cancels at ctrl+C
go onSignal(os.Interrupt, cancel)
numberOfWorkers := 2
start := time.Now()
csvfile, err := os.Open(phone.File)
if err != nil {
log.Fatal(err)
}
defer csvfile.Close()
reader := csv.NewReader(csvfile)
// create the pair of input/output channels for the controller=>workers com.
src := make(chan []string)
out := make(chan []string)
// use a waitgroup to manage synchronization
var wg sync.WaitGroup
// declare the workers
for i := 0; i < numberOfWorkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
worker(ctx, out, src)
}()
}
// read the csv and write it to src
go func() {
for {
record, err := reader.Read()
if err == io.EOF {
break
} else if err != nil {
log.Fatal(err)
}
src <- record // you might select on ctx.Done().
}
close(src) // close src to signal workers that no more jobs are incoming.
}()
// wait for worker group to finish and close out
go func() {
wg.Wait() // wait for writers to quit.
close(out) // when you close(out) it breaks the below loop.
}()
path := "sample_all.csv"
file, err := os.Create(path)
if err != nil {
return err
}
defer file.Close()
csvwriter := csv.NewWriter(file)
// drain the output
for res := range out {
csvwriter.Write(res)
}
csvwriter.Flush()
fmt.Printf("\n%.2fs", time.Since(start).Seconds())
return nil
}
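To answer the chunking question directly: csv.Writer already buffers in memory (it wraps a bufio.Writer), so a single Flush at the end is usually enough. If you still want explicit batches, here is a sketch of the drain loop above rewritten to write in chunks; chunkSize is an arbitrary assumption to tune for your workload:

const chunkSize = 1000 // assumption: adjust to taste
chunk := make([][]string, 0, chunkSize)
for res := range out {
	chunk = append(chunk, res)
	if len(chunk) == chunkSize {
		if err := csvwriter.WriteAll(chunk); err != nil { // WriteAll writes the records and flushes
			return err
		}
		chunk = chunk[:0]
	}
}
if len(chunk) > 0 {
	if err := csvwriter.WriteAll(chunk); err != nil {
		return err
	}
}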
I have the following and am trying to make some concurrent HTTP calls so that I can speed up the entire program, rather than doing each call one by one:
package main
import (
"fmt"
"net/http"
"time"
)
type U struct {
name string
u string
resp *http.Response
}
func main() {
urls := []*U{
&U{"example", "http://www.example.com", nil},
&U{"yahoo", "http://www.yahoo.com", nil},
&U{"google", "http://www.google.com", nil},
}
ch := make(chan *U)
// read from the channel
go func() {
for c := range ch {
for i, u := range urls {
if c.name == u.name {
urls[i] = c
}
}
}
}()
// fetch the stuff
for _, u := range urls {
go func(u *U) {
var err error
u, err = getResponse(u)
if err != nil {
fmt.Println(err)
}
ch <- u
}(u)
}
for i, u := range urls {
fmt.Println(i, u.resp) // all nil
}
}
func getResponse(u *U) (*U, error) {
c := &http.Client{
Timeout: 10 * time.Second,
}
var err error
u.resp, err = c.Get(u.u)
return u, err
}
https://play.golang.org/p/Zko8xkEqDMB
I am obviously not doing something right, as it prints
0 <nil>
1 <nil>
2 <nil>
whereas it should print the response as well.
How do I make sure I wait until everything is done so that I can move forward?
You're not waiting for the responses to return. This is equivalent to:
urls := []*U{
&U{"example", "http://www.example.com", nil},
&U{"yahoo", "http://www.yahoo.com", nil},
&U{"google", "http://www.google.com", nil},
}
for i, u := range urls {
fmt.Println(i, u.resp) // all nil
}
Instead you could use a sync.WaitGroup to make sure all the work is done before you display responses:
var wg sync.WaitGroup
for _, u := range urls {
wg.Add(1) // Add job to the waitgroup
go func(u *U) {
var err error
u, err = getResponse(u)
if err != nil {
fmt.Println(err)
}
ch <- u
wg.Done() // Note when the job is done
}(u)
}
wg.Wait() // wait until all the Add'd jobs are Done'd
for i, u := range urls {
fmt.Println(i, u.resp) // now populated
}
Or you could handle printing responses in the same chain as getResponse:
for _, u := range urls {
go func(u *U) {
var err error
u, err = getResponse(u)
if err != nil {
fmt.Println(err)
}
printResponse(u) // implement printResponse however
ch <- u
}(u)
}
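The answer leaves printResponse to the reader; one purely illustrative version (the output format is an assumption, and the body is closed here since nothing else reads it):

func printResponse(u *U) {
	if u.resp == nil {
		fmt.Println(u.name, "<no response>")
		return
	}
	defer u.resp.Body.Close() // avoid leaking the connection
	fmt.Println(u.name, u.resp.Status)
}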
I am trying to achieve some sort of multi-threaded processing here.
func (m *Map) Parse(mapData Node) error {
wg := &sync.WaitGroup{}
for _, node := range mapData.child {
wg.Add(1)
go parseChild(node, m, wg)
}
wg.Wait()
close(errors)
return nil
}
func parseChild(node Node, m *Map, wg *sync.WaitGroup) {
defer wg.Done()
var nodeType uint8
if err := binary.Read(node.data, binary.LittleEndian, &nodeType); err != nil {
errors <- err
}
if nodeType == OTBMNodeTowns {
for _, town := range node.child {
var nodeType uint8
if err := binary.Read(town.data, binary.LittleEndian, &nodeType); err != nil {
errors <- err
return
}
if nodeType != OTBMNodeTown {
errors <- fmt.Errorf("Parsing map towns: expected %v got %v", OTBMNodeTown, nodeType)
return
}
currentTown := Town{}
if err := binary.Read(town.data, binary.LittleEndian, &currentTown.ID); err != nil {
errors <- err
return
} else if currentTown.Name, err = town.ReadString(); err != nil {
errors <- err
return
} else if currentTown.TemplePosition, err = town.ReadPosition(); err != nil {
errors <- err
return
}
m.Towns = append(m.Towns, currentTown)
errors <- fmt.Errorf("This should be called: %v, nodeType)
return
}
}
}
But my goroutine never sends anything to the errors channel. It seems the main thread is not waiting for the goroutines to even finish.
I have no idea what I am missing here. I'm waiting for all routines to finish using wg.Wait, but it doesn't seem to be working as I think it should.
And yes, the slice is populated with at least 3 results. This is the errors channel:
var (
errors = make(chan error, 0)
)
func init() {
go errChannel()
}
func errChannel() {
for {
select {
case err := <-errors:
log.Println(err)
}
}
}
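One thing that stands out in this setup (an observation from channel semantics, not a confirmed diagnosis of the missing sends): once Parse calls close(errors), the receive in the select above succeeds immediately with a nil error, forever, so errChannel spins logging nils. Ranging over the channel instead stops cleanly when it is closed:

func errChannel() {
	for err := range errors { // ends when Parse calls close(errors)
		log.Println(err)
	}
}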
Following some advice from @Sam Whited and doing some research on Stack Overflow, I've rewritten my code; see below. This version seems more stable; however, every once in a while I get a slew of TCP errors, as if I'm not closing my requests. I've throttled the requests by adding a sleep, which seems to help a bit.
func main() {
runtime.GOMAXPROCS(maxParallelism())
var file = flag.String("f", "", "Enter new line deliminated text file")
var fileName = flag.String("s", "contact_bot.csv", "Enter new line deliminated text file")
flag.Parse()
if *file != "" {
counter := 0
filters = []string{"info", "ads", "sales", "sale", "info", "media", "mediarelations", "media_relations", "contact", "contacts", "contactus", "contact_us", "contact-us", "about_us", "general", "advertise", "support", "systems", "system"}
emailRE = regexp.MustCompile(`([a-z0-9!#$%&'*+\/=?^_{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)`)
seedUrls, err := readLines(*file)
checkErr(err)
numberOfUrls := len(seedUrls)
usr, err := user.Current()
checkErr(err)
parentPath := filepath.Join(usr.HomeDir, "/Desktop/"+*fileName)
file, err := os.Create(parentPath)
checkErr(err)
defer file.Close()
writer := csv.NewWriter(file)
defer writer.Flush()
var header = []string{"URL", "EMAILS"}
err = writer.Write(header)
checkErr(err)
data = make(chan *HTTPResponse)
go asyncHTTPGets(seedUrls)
loop:
for result := range data {
counter++
emails := findEmails(result.HTML, filters)
fmt.Printf("%s, %s, %s\n", result.URL, emails, strconv.Itoa(numberOfUrls))
var row = []string{result.URL, strings.Join(emails, ",")}
err := writer.Write(row)
// writer.Flush()
checkErr(err)
if counter == len(seedUrls) {
break loop
}
numberOfUrls--
}
}
}
// AsyncHTTPGets ...
func asyncHTTPGets(urls []string) {
counter := 0
for _, url := range urls {
counter++
if counter%10 == 0 {
time.Sleep(1 * time.Second)
}
go func(url string) {
fmt.Printf("Fetching %s \n", url)
resp, err := http.Get(url)
if err != nil {
fmt.Println(err.Error())
data <- &HTTPResponse{url, err.Error()}
return
}
b := resp.Body
buf := new(bytes.Buffer)
buf.ReadFrom(b)
resp.Body.Close()
myHTML := buf.String()
data <- &HTTPResponse{url, myHTML}
}(url)
}
}
func findEmails(html string, filters []string) []string {
emails := emailRE.FindAllString(html, -1)
filteredEmails := []string{}
for _, email := range emails {
if stringInSlice(email, filters) {
if !stringInSlice(email, filteredEmails) {
filteredEmails = append(filteredEmails, email)
}
}
}
sort.Strings(filteredEmails)
return filteredEmails
}
The application will open a large number of sockets and possibly breach file descriptor limits. I suggest limiting the number of concurrent requests to prevent this issue:
var (
requestMu sync.Mutex // protects requestCount
requestCount int // incremented on each request
)
// Create 10 workers. Adjust up or down as needed.
for w := 0; w < 10; w++ {
go func() {
for {
// Increment request count. Exit at end.
requestMu.Lock()
i := requestCount
requestCount++
requestMu.Unlock()
if i >= len(seedUrls) {
return
}
// Fetch the current URL.
myURL := seedUrls[i]
resp, err := http.Get(myURL)
if err != nil {
fmt.Println(myURL, err.Error(), i)
data <- &HTTPResponse{myURL, err.Error()}
continue
}
// Read body and close.
b, err := ioutil.ReadAll(resp.Body)
resp.Body.Close()
if err != nil {
fmt.Println(myURL, err.Error(), i)
data <- &HTTPResponse{myURL, err.Error()}
continue
}
myHTML := string(b)
data <- &HTTPResponse{myURL, myHTML}
}
}()
}
// Receive the expected number of results
for i := 0; i < len(seedUrls); i++ {
result := <-data
emails := findEmails(result.HTML, filters)
fmt.Printf("%s, %s, %d\n", result.URL, emails, i)
var row = []string{result.URL, strings.Join(emails, ",")}
err := writer.Write(row)
writer.Flush()
if err != nil {
panic(err)
}
}
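Another common way to cap concurrency, sketched under the same assumptions (seedUrls, data, and HTTPResponse as above): a buffered channel used as a semaphore, so at most cap(sem) requests are in flight at once. The counting receive loop above still collects len(seedUrls) results, so no extra synchronization is needed:

sem := make(chan struct{}, 10) // capacity = max requests in flight
for _, u := range seedUrls {
	go func(u string) {
		sem <- struct{}{}        // acquire a slot
		defer func() { <-sem }() // release it when done
		resp, err := http.Get(u)
		if err != nil {
			data <- &HTTPResponse{u, err.Error()}
			return
		}
		b, err := ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			data <- &HTTPResponse{u, err.Error()}
			return
		}
		data <- &HTTPResponse{u, string(b)}
	}(u)
}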
I am trying to adapt this example:
https://gobyexample.com/worker-pools
But I don't know how to stop reading from the results channel, because the program doesn't exit at the end of the channel loop.
Can you explain how to exit the program?
package main
import (
"github.com/SlyMarbo/rss"
"bufio"
"fmt"
"log"
"os"
)
func readLines(path string) ([]string, error) {
file, err := os.Open(path)
if err != nil {
return nil, err
}
defer file.Close()
var lines []string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
lines = append(lines, scanner.Text())
}
return lines, scanner.Err()
}
func worker(id int, jobs <-chan string, results chan<- string) {
for url := range jobs {
fmt.Println("worker", id, "processing job", url)
feed, err := rss.Fetch(url)
if err != nil {
fmt.Println("Error on: ", url)
continue
}
borne := 0
for _, value := range feed.Items {
if borne < 5 {
results <- value.Link
borne = borne +1
} else {
continue
}
}
}
}
func main() {
jobs := make(chan string)
results := make(chan string)
for w := 1; w <= 16; w++ {
go worker(w, jobs, results)
}
urls, err := readLines("flux.txt")
if err != nil {
log.Fatalf("readLines: %s", err)
}
for _, url := range urls {
jobs <- url
}
close(jobs)
// it seems program runs over...
for msg := range results {
fmt.Println(msg)
}
}
The flux.txt is a flat text file like:
http://blog.case.edu/news/feed.atom
...
The problem is that, in the example you are referring to, the worker pool reads from results 9 times:
for a := 1; a <= 9; a++ {
<-results
}
Your program, on the other hand, does a range loop over results, which has different semantics in Go. The range operator does not stop until the channel is closed.
for msg := range results {
fmt.Println(msg)
}
To fix your problem you'd need to close the results channel. However, if you just call close(results) before the for loop, you will most probably get a panic, because the workers might still be writing to results.
To fix this, you need a way to be notified when all the workers are done. You can do this either with a sync.WaitGroup:
const (
workers = 16
)
func main() {
jobs := make(chan string, 100)
results := make(chan string, 100)
var wg sync.WaitGroup
for w := 0; w < workers; w++ {
wg.Add(1) // Add before starting the goroutine, not inside it
go func(w int) {
defer wg.Done()
worker(w, jobs, results)
}(w) // pass w to avoid capturing the loop variable
}
urls, err := readLines("flux.txt")
if err != nil {
log.Fatalf("readLines: %s", err)
}
for _, url := range urls {
jobs <- url
}
close(jobs)
// Wait and close in a separate goroutine so main can drain results below;
// waiting first can deadlock once the results buffer fills up.
go func() {
wg.Wait()
close(results)
}()
for msg := range results {
fmt.Println(msg)
}
}
Or a done channel:
package main
import (
"bufio"
"fmt"
"github.com/SlyMarbo/rss"
"log"
"os"
)
func readLines(path string) ([]string, error) {
file, err := os.Open(path)
if err != nil {
return nil, err
}
defer file.Close()
var lines []string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
lines = append(lines, scanner.Text())
}
return lines, scanner.Err()
}
func worker(id int, jobs <-chan string, results chan<- string, done chan struct{}) {
for url := range jobs {
fmt.Println("worker", id, "processing job", url)
feed, err := rss.Fetch(url)
if err != nil {
fmt.Println("Error on: ", url)
continue
}
borne := 0
for _, value := range feed.Items {
if borne < 5 {
results <- value.Link
borne = borne + 1
} else {
continue
}
}
}
close(done)
}
const (
workers = 16
)
func main() {
jobs := make(chan string, 100)
results := make(chan string, 100)
dones := make([]chan struct{}, workers)
for w := 0; w < workers; w++ {
dones[w] = make(chan struct{})
go worker(w, jobs, results, dones[w])
}
urls, err := readLines("flux.txt")
if err != nil {
log.Fatalf("readLines: %s", err)
}
for _, url := range urls {
jobs <- url
}
close(jobs)
// As above, wait and close in a goroutine so main can drain results.
go func() {
for _, done := range dones {
<-done
}
close(results)
}()
for msg := range results {
fmt.Println(msg)
}
}
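For completeness, the WaitGroup variant can be written more compactly with golang.org/x/sync/errgroup (an external module); a sketch reusing the three-argument worker and the same jobs/results channels:

var g errgroup.Group
for w := 0; w < workers; w++ {
	w := w // pin the loop variable for the closure (needed before Go 1.22)
	g.Go(func() error {
		worker(w, jobs, results)
		return nil
	})
}
// send the urls to jobs and close(jobs) as before, then:
go func() {
	_ = g.Wait()   // all workers have returned, so no more sends
	close(results) // ends the range over results in main
}()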