Go Colly parallelism decreases the number of links scraped

I am trying to build a web scraper to scrape jobs from internshala.com. I am using Go Colly to build it. I visit every page and then visit the subsequent links of each job to scrape the data. Doing this sequentially scrapes almost all the links, but if I try Colly's parallel scraping, the number of links scraped decreases. I write all the data to a CSV file.
EDIT
My question is: why does this happen when scraping in parallel, and how can I solve it (how can I scrape all the data even when scraping in parallel)?
Or is there something else I am doing wrong that is causing the problem? A code review would be really helpful. Thanks :)
package main

import (
    "encoding/csv"
    "log"
    "os"
    "strconv"
    "sync"
    "time"

    "github.com/gocolly/colly"
)

func main() {
    parallel(10)
    seq(10)
}
I comment out one of the two functions before running, for obvious reasons.
The parallel function:
func parallel(n int) {
    start := time.Now()
    c := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail",
            "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
        colly.Async(true),
    )
    d := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail",
            "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
        colly.Async(true),
    )
    c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 4})
    d.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 4})
    fileName := "data.csv"
    file, err := os.Create(fileName)
    cnt := 0
    if err != nil {
        log.Fatalf("Could not create file, err: %q", err)
        return
    }
    defer file.Close() // close the file after the main routine exits
    writer := csv.NewWriter(file)
    defer writer.Flush()
    var wg sync.WaitGroup
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        if e.Attr("class") != "view_detail_button" {
            return
        }
        detailsLink := e.Attr("href")
        d.Visit(e.Request.AbsoluteURL(detailsLink))
    })
    d.OnHTML(".detail_view", func(e *colly.HTMLElement) {
        wg.Add(1)
        go func(wg *sync.WaitGroup) {
            writer.Write([]string{
                e.ChildText("span.profile_on_detail_page"),
                e.ChildText(".company_name a"),
                e.ChildText("#location_names a"),
                e.ChildText(".internship_other_details_container > div:first-of-type > div:last-of-type .item_body"),
                e.ChildText("span.stipend"),
                e.ChildText(".applications_message"),
                e.ChildText(".internship_details > div:nth-last-of-type(3)"),
                e.Request.URL.String(),
            })
            wg.Done()
        }(&wg)
    })
    c.OnRequest(func(r *colly.Request) {
        log.Println("visiting", r.URL.String())
    })
    d.OnRequest(func(r *colly.Request) {
        log.Println("visiting", r.URL.String())
        cnt++
    })
    for i := 1; i < n; i++ {
        c.Visit("https://internshala.com/internships/page-" + strconv.Itoa(i))
    }
    c.Wait()
    d.Wait()
    wg.Wait()
    t := time.Since(start)
    log.Printf("time %v \n", t)
    log.Printf("amount %v \n", cnt)
    log.Printf("Scrapping complete")
    log.Println(c)
}
The seq function:
func seq(n int) {
    start := time.Now()
    c := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail",
            "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
    )
    d := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail",
            "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
    )
    fileName := "data.csv"
    file, err := os.Create(fileName)
    cnt := 0
    if err != nil {
        log.Fatalf("Could not create file, err: %q", err)
        return
    }
    defer file.Close() // close the file after the main routine exits
    writer := csv.NewWriter(file)
    defer writer.Flush()
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        if e.Attr("class") != "view_detail_button" {
            return
        }
        detailsLink := e.Attr("href")
        d.Visit(e.Request.AbsoluteURL(detailsLink))
    })
    d.OnHTML(".detail_view", func(e *colly.HTMLElement) {
        writer.Write([]string{
            e.ChildText("span.profile_on_detail_page"),
            e.ChildText(".company_name a"),
            e.ChildText("#location_names a"),
            e.ChildText(".internship_other_details_container > div:first-of-type > div:last-of-type .item_body"),
            e.ChildText("span.stipend"),
            e.ChildText(".applications_message"),
            e.ChildText(".internship_details > div:nth-last-of-type(3)"),
            e.Request.URL.String(),
        })
    })
    c.OnRequest(func(r *colly.Request) {
        log.Println("visiting", r.URL.String())
    })
    d.OnRequest(func(r *colly.Request) {
        log.Println("visiting", r.URL.String())
        cnt++
    })
    for i := 1; i < n; i++ {
        // Add URLs to the queue
        c.Visit("https://internshala.com/internships/page-" + strconv.Itoa(i))
    }
    t := time.Since(start)
    log.Printf("time %v \n", t)
    log.Printf("amount %v \n", cnt)
    log.Printf("Scrapping complete")
    log.Println(c)
}
Any help will be much appreciated. :)

Sorry for being late to the party, but I came up with a working solution to your problem. Let me show it:
package main

import (
    "encoding/csv"
    "fmt"
    "log"
    "os"
    "strconv"
    "strings"
    "time"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/queue"
)

func parallel(n int) {
    start := time.Now()
    cnt := 0
    q, _ := queue.New(8, &queue.InMemoryQueueStorage{MaxSize: 1000}) // tried up to 8 threads
    fileName := "data_par.csv"
    file, err := os.Create(fileName)
    if err != nil {
        log.Fatalf("Could not create file, err: %q", err)
        return
    }
    defer file.Close() // close the file after the main routine exits
    writer := csv.NewWriter(file)
    defer func() {
        writer.Flush()
        if err := writer.Error(); err != nil {
            panic(err)
        }
    }()
    c := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail",
            "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
    )
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        if e.Attr("class") != "view_detail_button" {
            return
        }
        detailsLink := e.Attr("href")
        e.Request.Visit(detailsLink)
    })
    c.OnRequest(func(r *colly.Request) {
        writer.Write([]string{r.URL.String()})
        cnt++ // count every request that goes out
    })
    for i := 1; i < n; i++ {
        q.AddURL("https://internshala.com/internships/page-" + strconv.Itoa(i))
    }
    q.Run(c)
    t := time.Since(start)
    log.Printf("time: %v\tamount: %d\n", t, cnt)
}

func seq(n int) {
    start := time.Now()
    c := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail",
            "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
    )
    fileName := "data_seq.csv"
    file, err := os.Create(fileName)
    cnt := 0
    if err != nil {
        log.Fatalf("Could not create file, err: %q", err)
        return
    }
    defer file.Close() // close the file after the main routine exits
    writer := csv.NewWriter(file)
    defer func() {
        writer.Flush()
        if err := writer.Error(); err != nil {
            panic(err)
        }
    }()
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        if e.Attr("class") != "view_detail_button" {
            return
        }
        detailsLink := e.Attr("href")
        e.Request.Visit(detailsLink)
    })
    c.OnRequest(func(r *colly.Request) {
        writer.Write([]string{r.URL.String()})
        cnt++ // count every request that goes out
    })
    for i := 1; i < n; i++ {
        c.Visit("https://internshala.com/internships/page-" + strconv.Itoa(i))
    }
    t := time.Since(start)
    log.Printf("time: %v\tamount: %d\n", t, cnt)
}

func main() {
    fmt.Println("sequential")
    seq(6)
    fmt.Println(strings.Repeat("#", 50))
    fmt.Println("parallel")
    parallel(6)
}
The problem
After looking at your code, I think that everything is implemented correctly. Sure, things could be done in a better way, but at least as far as the concurrency is concerned everything is set up properly. Some aspects that you could improve are in the following list:
Check for the Error while flushing to the underlying CSV file
Use only one collector instead of two
Again, as I already said, these are only small refinements.
The actual problem
The actual problem is that when you make concurrent (and potentially parallel) requests, the colly collector cannot keep up and some of the responses are lost. This gets worse as you increase the number of requests.
The easiest solution (IMO)
gocolly provides the Queue type, which fits these challenges very well. With it, you can be sure that every request will be processed, while the requests are still made concurrently. The steps can be summarized as follows (a minimal sketch of these steps follows the list):
Instantiate a new queue with the New function provided by the queue sub-package. You have to set the number of consumer threads and the type of queue storage (in our case an in-memory implementation is fine).
Instantiate a default collector with all of its needed callbacks.
Invoke the AddURL method on the queue defined above with the appropriate URLs to fetch.
Invoke the Run method, which sends the actual requests to the target URLs and waits for the responses.
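For quick reference, the same four steps boiled down to a minimal sketch (error handling omitted; the listing URL is just an example):

    q, _ := queue.New(8, &queue.InMemoryQueueStorage{MaxSize: 1000}) // 1. queue with 8 consumer threads
    c := colly.NewCollector()                                        // 2. default collector with its callbacks
    c.OnResponse(func(r *colly.Response) {
        log.Println("got", r.Request.URL) // handle the response here
    })
    q.AddURL("https://internshala.com/internships/page-1") // 3. enqueue the URLs to fetch
    q.Run(c)                                               // 4. make the requests and wait for the responses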
Note that I simplified the solution you shared just to focus on the number of requests in the two approaches. I didn't check the logic you wrote in the OnHTML callback but I assumed it worked.
Let me know if this solves your issue or share how you were able to solve this problem, thanks!

Related

Why don't the GO routines start?

I wrote a simple Go program that should start 3 goroutines. However, the goroutines don't start.
Please note that the situation is not identical to the one described in this post:
Why is my goroutine not executed?
The program should wait for the goroutines to end their execution... Thus, the program should wait forever (since the routines never stop).
package main

import (
    "fmt"
    "net"
    "os"
    "sync"
    "time"
)

func main() {
    wg := sync.WaitGroup{}
    fmt.Print("Starting 3 clients\n")
    for i := 0; i < 3; i++ {
        client := func(inName string) {
            fmt.Printf("Client <%s> started\n", inName)
            wg.Add(1)
            conn, err := net.Dial("tcp", ":8000")
            if err != nil {
                fmt.Printf("[%s] Error while connecting to the server: %s", inName, err.Error())
                os.Exit(1)
            }
            n := 0
            sleepDuration, _ := time.ParseDuration("2s")
            for {
                message := fmt.Sprintf("[%s] > message %d\n", inName, n)
                fmt.Printf("%s", message)
                _, err := conn.Write([]byte(message))
                if nil != err {
                    fmt.Sprintf("[%s] Error while writing data to the socket: %s", inName, err.Error())
                    os.Exit(1)
                }
                time.Sleep(sleepDuration)
                n++
            }
        }
        name := fmt.Sprintf("Client%d", i)
        fmt.Printf("Starting client <%s>...\n", name)
        go client(name)
        fmt.Print("Done\n")
    }
    wg.Wait()
}
Result:
Starting 3 clients
Starting client <Client0>...
Done
Starting client <Client1>...
Done
Starting client <Client2>...
Done
As this solution says:
It is important that the wg.Add() happens before the go statement to
prevent race conditions. The following would also be correct:
So, you need to take the wg.Add(1) out of the goroutine itself and call it just before starting the goroutine. Also, call defer wg.Done() at the start of the function so that it decrements the WaitGroup counter by one when the function returns. Take a look at the code below.
package main

import (
    "fmt"
    "net"
    "os"
    "sync"
    "time"
)

func main() {
    wg := sync.WaitGroup{}
    fmt.Print("Starting 3 clients\n")
    for i := 0; i < 3; i++ {
        client := func(inName string) {
            defer wg.Done() // added this line
            fmt.Printf("Client <%s> started\n", inName)
            conn, err := net.Dial("tcp", ":8000")
            if err != nil {
                fmt.Printf("[%s] Error while connecting to the server: %s", inName, err.Error())
                os.Exit(1)
            }
            n := 0
            sleepDuration, _ := time.ParseDuration("2s")
            for {
                message := fmt.Sprintf("[%s] > message %d\n", inName, n)
                fmt.Printf("%s", message)
                _, err := conn.Write([]byte(message))
                if nil != err {
                    fmt.Sprintf("[%s] Error while writing data to the socket: %s", inName, err.Error())
                    os.Exit(1)
                }
                time.Sleep(sleepDuration)
                n++
            }
        }
        name := fmt.Sprintf("Client%d", i)
        fmt.Printf("Starting client <%s>...\n", name)
        wg.Add(1) // moved this line
        go client(name)
        fmt.Print("Done\n")
    }
    wg.Wait()
}

go routines hang on download of large files

I am attempting to write an application that will download a range of images.
130 × 116 KB images (works)
50 × 500 KB images (works)
130 × 500 KB images (eventually hangs)
230 × 116 KB images (eventually hangs)
go version go1.9.2 darwin/amd64
package main

import (
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "sync"
)

func main() {
    var urls []string
    // var smallImage = "https://s3.amazonaws.com/golangplayground/116kb.jpeg" //116kb
    var largeImage = "https://s3.amazonaws.com/golangplayground/SampleJPGImage_500kbmb.jpg" //500kb
    for i := 0; i < 130; i++ {
        urls = append(urls, largeImage)
    }
    var wg sync.WaitGroup
    wg.Add(len(urls))
    var inc = 0
    for _, val := range urls {
        inc += 1
        go saveResourceFromURLToDisk(val, "./foo", &wg, inc)
    }
    wg.Wait()
    fmt.Println("done.")
}

func saveResourceFromURLToDisk(url string, writeTo string, wg *sync.WaitGroup, inc int) error {
    defer wg.Done()
    response, err := http.Get(url)
    if err != nil {
        log.Fatal(err)
        return err
    }
    defer response.Body.Close()
    localPath := fmt.Sprintf("%s/%d", writeTo, inc)
    file, err := os.Create(localPath)
    if err != nil {
        log.Fatal(err)
        return err
    }
    defer file.Close()
    _, err = io.Copy(file, response.Body)
    if err != nil {
        log.Fatal(err)
        return err
    }
    fmt.Println(localPath)
    return nil
}
This is probably a network problem. There's a reason web browsers limit how many sessions they open to the same server.
If you open a bunch of TCP sessions all at the same time, almost all of them will lose packets. Then they'll all retry at about the same time, losing more packets. It is just a big pile of loss.
Place a small delay between opening each GET request, or limit yourself to 4-8 simultaneous downloads from the same server.
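To illustrate the second option, a buffered channel works as a simple semaphore. This is only a sketch, assuming it replaces the download loop in the question's main and reuses saveResourceFromURLToDisk exactly as posted (which already calls wg.Done); the limit of 5 is arbitrary:

    sem := make(chan struct{}, 5) // at most 5 downloads in flight at any time
    var wg sync.WaitGroup
    for i, u := range urls {
        wg.Add(1)
        sem <- struct{}{} // acquire a slot; blocks while 5 downloads are already running
        go func(u string, i int) {
            defer func() { <-sem }() // release the slot when this download finishes
            saveResourceFromURLToDisk(u, "./foo", &wg, i) // calls wg.Done() itself
        }(u, i)
    }
    wg.Wait()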
I found the answer with Zan's help, by bucketing my goroutines into groups of 5 requests each... This way I get to take advantage of some parallelism while throttling the number of open connections I am creating.
It's a bit naive, and I am wondering if anyone has a more elegant solution.
package main

import (
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "sync"
)

func main() {
    var urls []string
    // var smallImage = "https://s3.amazonaws.com/golangplayground/116kb.jpeg" //116kb
    var largeImage = "https://s3.amazonaws.com/golangplayground/SampleJPGImage_500kbmb.jpg" //500kb
    for i := 0; i < 150; i++ {
        urls = append(urls, largeImage)
    }
    var inc = 0
    for x := 0; x < len(urls)/5; x++ {
        var wg sync.WaitGroup
        for y := 0; y < 5; y++ {
            wg.Add(1)
            go saveResourceFromURLToDisk(urls[x*5+y], "./foo", &wg, inc) // x*5+y indexes into the current bucket
            inc += 1
        }
        wg.Wait()
    }
    fmt.Println("done.")
}

func saveResourceFromURLToDisk(url string, writeTo string, wg *sync.WaitGroup, inc int) error {
    defer wg.Done()
    response, err := http.Get(url)
    if err != nil {
        log.Fatal(err)
        return err
    }
    defer response.Body.Close()
    localPath := fmt.Sprintf("%s/%d", writeTo, inc)
    file, err := os.Create(localPath)
    if err != nil {
        log.Fatal(err)
        return err
    }
    defer file.Close()
    _, err = io.Copy(file, response.Body)
    if err != nil {
        log.Fatal(err)
        return err
    }
    fmt.Println(localPath)
    return nil
}

Confusion regarding channel directions and blocking in Go

In a function definition, if a channel is an argument without a direction, does it have to send or receive something?
func makeRequest(url string, ch chan<- string, results chan<- string) {
    start := time.Now()
    resp, err := http.Get(url)
    defer resp.Body.Close()
    if err != nil {
        fmt.Printf("%v", err)
    }
    resp, err = http.Post(url, "text/plain", bytes.NewBuffer([]byte("Hey")))
    defer resp.Body.Close()
    secs := time.Since(start).Seconds()
    if err != nil {
        fmt.Printf("%v", err)
    }
    // Cannot move past this.
    ch <- fmt.Sprintf("%f", secs)
    results <- <-ch
}

func MakeRequestHelper(url string, ch chan string, results chan string, iterations int) {
    for i := 0; i < iterations; i++ {
        makeRequest(url, ch, results)
    }
    for i := 0; i < iterations; i++ {
        fmt.Println(<-ch)
    }
}

func main() {
    args := os.Args[1:]
    threadString := args[0]
    iterationString := args[1]
    url := args[2]
    threads, err := strconv.Atoi(threadString)
    if err != nil {
        fmt.Printf("%v", err)
    }
    iterations, err := strconv.Atoi(iterationString)
    if err != nil {
        fmt.Printf("%v", err)
    }
    channels := make([]chan string, 100)
    for i := range channels {
        channels[i] = make(chan string)
    }
    // results aggregate all the things received by channels in all goroutines
    results := make(chan string, iterations*threads)
    for i := 0; i < threads; i++ {
        go MakeRequestHelper(url, channels[i], results, iterations)
    }
    resultSlice := make([]string, threads*iterations)
    for i := 0; i < threads*iterations; i++ {
        resultSlice[i] = <-results
    }
}
In the above code, the channel operations (ch <- ... and results <- <-ch) seem to be blocking every goroutine that executes makeRequest.
I am new to the concurrency model of Go. I understand that sending to and receiving from a channel blocks, but I find it difficult to see what is blocking what in this code.
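For example, I do understand the basic rule that a send on an unbuffered channel blocks until another goroutine receives from it (and vice versa), as in this small self-contained illustration:

package main

import (
    "fmt"
    "time"
)

func main() {
    ch := make(chan string) // unbuffered

    go func() {
        time.Sleep(time.Second)
        fmt.Println("received:", <-ch) // the receive releases the blocked sender
    }()

    fmt.Println("sending...")
    ch <- "hello" // blocks here until the goroutine above receives
    fmt.Println("send completed")
}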
I'm not really sure what you are doing... It seems really convoluted. I suggest you read up on how to use channels.
https://tour.golang.org/concurrency/2
That being said, you have so much going on in your code that it was much easier to just gut it down to something a bit simpler. (It can be simplified further.) I left comments to help you understand the code.
package main

import (
    "fmt"
    "io/ioutil"
    "log"
    "net/http"
    "sync"
    "time"
)

// using structs is a nice way to organize your code
type Worker struct {
    wg        sync.WaitGroup
    semaphore chan struct{}
    result    chan Result
    client    http.Client
}

// group returns so that you don't have to send to many channels
type Result struct {
    duration float64
    results  string
}

// closing your channels will stop the for loop in main
func (w *Worker) Close() {
    close(w.semaphore)
    close(w.result)
}

func (w *Worker) MakeRequest(url string) {
    // a semaphore is a simple way to rate limit the amount of goroutines running at any single point of time
    // google them, Go uses them often
    w.semaphore <- struct{}{}
    defer func() {
        w.wg.Done()
        <-w.semaphore
    }()
    start := time.Now()
    resp, err := w.client.Get(url)
    if err != nil {
        log.Println("error", err)
        return
    }
    defer resp.Body.Close()
    // don't have any examples where I need to also POST anything but the point should be made
    // resp, err = http.Post(url, "text/plain", bytes.NewBuffer([]byte("Hey")))
    // if err != nil {
    //     log.Println("error", err)
    //     return
    // }
    // defer resp.Body.Close()
    secs := time.Since(start).Seconds()
    b, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        log.Println("error", err)
        return
    }
    w.result <- Result{duration: secs, results: string(b)}
}

func main() {
    urls := []string{"https://facebook.com/", "https://twitter.com/", "https://google.com/", "https://youtube.com/", "https://linkedin.com/", "https://wordpress.org/",
        "https://instagram.com/", "https://pinterest.com/", "https://wikipedia.org/", "https://wordpress.com/", "https://blogspot.com/", "https://apple.com/",
    }
    workerNumber := 5
    worker := Worker{
        semaphore: make(chan struct{}, workerNumber),
        result:    make(chan Result),
        client:    http.Client{Timeout: 5 * time.Second},
    }
    // use sync groups to allow your code to wait for
    // all your goroutines to finish
    for _, url := range urls {
        worker.wg.Add(1)
        go worker.MakeRequest(url)
    }
    // by declaring wait and close as a separate goroutine
    // I can get to the for loop below and iterate on the results
    // in a non blocking fashion
    go func() {
        worker.wg.Wait()
        worker.Close()
    }()
    // do something with the results channel
    for res := range worker.result {
        fmt.Printf("Request took %2.f seconds.\nResults: %s\n\n", res.duration, res.results)
    }
}
If the channels in channels are nil (i.e. you make the slice but never make the individual channels), any send or receive on them will block forever. I'm not sure exactly what you're trying to do here, but that's the basic problem.
See https://golang.org/doc/effective_go.html#channels for an explanation of how channels work.
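For reference, a send (or receive) on a nil channel blocks forever. A tiny self-contained illustration, using a non-blocking select only so the program can report it instead of deadlocking:

package main

import "fmt"

func main() {
    var ch chan string // nil channel: declared but never made
    select {
    case ch <- "x":
        fmt.Println("sent") // never happens: a send on a nil channel blocks forever
    default:
        fmt.Println("a send on a nil channel would block")
    }
}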

How to close a channel

I am trying to adapt this example:
https://gobyexample.com/worker-pools
But I don't know how to stop the channel, because the program doesn't exit at the end of the channel loop.
Can you explain how to exit the program?
package main

import (
    "bufio"
    "fmt"
    "log"
    "os"

    "github.com/SlyMarbo/rss"
)

func readLines(path string) ([]string, error) {
    file, err := os.Open(path)
    if err != nil {
        return nil, err
    }
    defer file.Close()
    var lines []string
    scanner := bufio.NewScanner(file)
    for scanner.Scan() {
        lines = append(lines, scanner.Text())
    }
    return lines, scanner.Err()
}

func worker(id int, jobs <-chan string, results chan<- string) {
    for url := range jobs {
        fmt.Println("worker", id, "processing job", url)
        feed, err := rss.Fetch(url)
        if err != nil {
            fmt.Println("Error on: ", url)
            continue
        }
        borne := 0
        for _, value := range feed.Items {
            if borne < 5 {
                results <- value.Link
                borne = borne + 1
            } else {
                continue
            }
        }
    }
}

func main() {
    jobs := make(chan string)
    results := make(chan string)
    for w := 1; w <= 16; w++ {
        go worker(w, jobs, results)
    }
    urls, err := readLines("flux.txt")
    if err != nil {
        log.Fatalf("readLines: %s", err)
    }
    for _, url := range urls {
        jobs <- url
    }
    close(jobs)
    // it seems program runs over...
    for msg := range results {
        fmt.Println(msg)
    }
}
The flux.txt is a flat text file like :
http://blog.case.edu/news/feed.atom
...
The problem is that, in the example you are referring to, the worker pool reads from results exactly 9 times:
    for a := 1; a <= 9; a++ {
        <-results
    }
Your program, on the other hand, does a range loop over results, which has different semantics in Go: the range loop does not stop until the channel is closed.
    for msg := range results {
        fmt.Println(msg)
    }
To fix your problem, you need to close the results channel. However, if you just call close(results) before the for loop, you will most probably get a panic, because the workers might still be writing to results.
To fix this, you need another way to be notified when all the workers are done. You can do this with a sync.WaitGroup:
const (
    workers = 16
)

func main() {
    jobs := make(chan string, 100)
    results := make(chan string, 100)
    var wg sync.WaitGroup
    for w := 0; w < workers; w++ {
        wg.Add(1) // Add must happen before the goroutine starts, not inside it
        go func(w int) {
            defer wg.Done()
            worker(w, jobs, results)
        }(w)
    }
    urls, err := readLines("flux.txt")
    if err != nil {
        log.Fatalf("readLines: %s", err)
    }
    for _, url := range urls {
        jobs <- url
    }
    close(jobs)
    wg.Wait()
    close(results)
    // it seems program runs over...
    for msg := range results {
        fmt.Println(msg)
    }
}
Or with a done channel:
package main

import (
    "bufio"
    "fmt"
    "log"
    "os"

    "github.com/SlyMarbo/rss"
)

func readLines(path string) ([]string, error) {
    file, err := os.Open(path)
    if err != nil {
        return nil, err
    }
    defer file.Close()
    var lines []string
    scanner := bufio.NewScanner(file)
    for scanner.Scan() {
        lines = append(lines, scanner.Text())
    }
    return lines, scanner.Err()
}

func worker(id int, jobs <-chan string, results chan<- string, done chan struct{}) {
    for url := range jobs {
        fmt.Println("worker", id, "processing job", url)
        feed, err := rss.Fetch(url)
        if err != nil {
            fmt.Println("Error on: ", url)
            continue
        }
        borne := 0
        for _, value := range feed.Items {
            if borne < 5 {
                results <- value.Link
                borne = borne + 1
            } else {
                continue
            }
        }
    }
    close(done)
}

const (
    workers = 16
)

func main() {
    jobs := make(chan string, 100)
    results := make(chan string, 100)
    dones := make([]chan struct{}, workers)
    for w := 0; w < workers; w++ {
        dones[w] = make(chan struct{})
        go worker(w, jobs, results, dones[w])
    }
    urls, err := readLines("flux.txt")
    if err != nil {
        log.Fatalf("readLines: %s", err)
    }
    for _, url := range urls {
        jobs <- url
    }
    close(jobs)
    for _, done := range dones {
        <-done
    }
    close(results)
    // it seems program runs over...
    for msg := range results {
        fmt.Println(msg)
    }
}

Any better way to keep track of goroutine responses?

I'm trying to get my head around goroutines. I've created a simple program that performs the same search in parallel across multiple search engines. At the moment to keep track of the number of responses, I count the number I've received. It seems a bit amateur though.
Is there a better way of knowing when I've received a response from all of the goroutines in the following code?
package main

import (
    "fmt"
    "log"
    "net/http"
)

type Query struct {
    url    string
    status string
}

func search(url string, out chan Query) {
    fmt.Printf("Fetching URL %s\n", url)
    resp, err := http.Get(url)
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()
    out <- Query{url, resp.Status}
}

func main() {
    searchTerm := "carrot"
    fmt.Println("Hello world! Searching for ", searchTerm)
    searchEngines := []string{
        "http://www.bing.co.uk/?q=",
        "http://www.google.co.uk/?q=",
        "http://www.yahoo.co.uk/?q="}
    out := make(chan Query)
    for i := 0; i < len(searchEngines); i++ {
        go search(searchEngines[i]+searchTerm, out)
    }
    progress := 0
    for {
        // is there a better way of doing this step?
        if progress >= len(searchEngines) {
            break
        }
        fmt.Println("Polling...")
        query := <-out
        fmt.Printf("Status from %s was %s\n", query.url, query.status)
        progress++
    }
}
Please use sync.WaitGroup; there is an example in the package documentation:
searchEngines := []string{
    "http://www.bing.co.uk/?q=",
    "http://www.google.co.uk/?q=",
    "http://www.yahoo.co.uk/?q="}
var wg sync.WaitGroup
out := make(chan Query)
for i := 0; i < len(searchEngines); i++ {
    wg.Add(1)
    go func(url string) {
        defer wg.Done()
        fmt.Printf("Fetching URL %s\n", url)
        resp, err := http.Get(url)
        if err != nil {
            log.Fatal(err)
        }
        defer resp.Body.Close()
        out <- Query{url, resp.Status}
    }(searchEngines[i] + searchTerm)
}
wg.Wait()
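One caveat with the snippet above: out is unbuffered, so each goroutine blocks on out <- Query{url, resp.Status} until something receives from it, and calling wg.Wait() on the main goroutine before reading the results would deadlock. A common pattern, sketched here on top of the same variables, is to replace that final wg.Wait() with a goroutine that waits and closes the channel, and then range over it in main:

go func() {
    wg.Wait()  // wait until every search goroutine has sent its result
    close(out) // then close the channel so the range loop below can finish
}()
for query := range out {
    fmt.Printf("Status from %s was %s\n", query.url, query.status)
}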
