How to add list of url to gocolly Queue? - go

I want to scrape a list of url to using gocolly
func main() {
fileName := "output.txt"
var result string
f, err := os.Create(fileName)
if err != nil {
panic(err)
}
defer func() {
if err := f.Close(); err != nil {
panic(err)
}
}()
rows := ReadInput()
q := AddUrl(rows)
// Instantiate default collector
c := colly.NewCollector()
c.OnHTML("body", func(e *colly.HTMLElement) {
result = result + e.Text +"\n"
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("visiting", r.URL)
})
// Set error handler
c.OnError(func(r *colly.Response, err error) {
fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
})
q.Run(c)
f.WriteString(result)
log.Printf("Scraping done, Please check file %q for results\n", fileName)
}
func ReadInput() []string{
// Read from file
b, err := ioutil.ReadFile("input.txt") // just pass the file name
if err != nil {
fmt.Print(err)
}
str := string(b) // convert content to a 'string'
// split each row
rows := strings.Split(str,"\n")
return rows
}
But when I am trying to add url from slice of string(url) to gocolly queue it doesn't add all url, just added the last url.
func AddUrl(rows []string) *queue.Queue {
Q, _ := queue.New(
2, // Number of consumer threads
&queue.InMemoryQueueStorage{MaxSize: 10000},
)
for _,url:=range rows{
Q.AddURL(url)
}
return Q
}
instead of loop if I add url maually then it's work perfectly, but with loop it just add the last element.
func AddUrl(rows []string) *queue.Queue {
Q, _ := queue.New(
2, // Number of consumer threads
&queue.InMemoryQueueStorage{MaxSize: 10000},
)
Q.AddURL("http://bakeshopva.com")
Q.AddURL("http://zekescoffeedc.com")
return Q
}

Related

Golang Chromedp: pdf file download without saving in server

How to chromedp pdf download without saving in server?
Below code is working for generating pdf file and saving in server side. But I want to download pdf file without saving in server side.
func PDFInvoice(c *gin.Context) {
session := sessions.Default(c)
id := c.Params.ByName("id")
token := session.Get("login_session").(string)
// create context
ctx, cancel := chromedp.NewContext(context.Background())
defer cancel()
// capture pdf
var buf []byte
url := "http://localhost:8080/invoice/" + id + "/" + token
if err := chromedp.Run(ctx, printToPDF(url, &buf)); err != nil {
log.Fatal(err)
}
buff := new(bytes.Buffer)
if _, err := buff.WriteTo(c.Writer); err != nil {
panic(err)
}
if err := os.WriteFile("sample.pdf", buf, 0o644); err != nil {
log.Fatal(err)
}
//ioutil.WriteFile("sample.pdf", buf, 0644)
c.JSON(200, id+" "+token)
}
// print a specific pdf page.
func printToPDF(urlstr string, res *[]byte) chromedp.Tasks {
return chromedp.Tasks{
chromedp.Navigate(urlstr),
chromedp.ActionFunc(func(ctx context.Context) error {
buf, _, err := page.PrintToPDF().WithPrintBackground(false).Do(ctx)
if err != nil {
return err
}
*res = buf
return nil
}),
}
}
You can write the bytes to http.ResponseWriter directly. See the demo below:
package main
import (
"context"
"log"
"net/http"
"sync"
"github.com/chromedp/cdproto/page"
"github.com/chromedp/chromedp"
)
func main() {
http.Handle("/pdf", http.HandlerFunc(servePDF))
log.Fatal(http.ListenAndServe(":8080", http.DefaultServeMux))
}
func servePDF(w http.ResponseWriter, r *http.Request) {
buf, err := createPDF()
if err != nil {
log.Fatalln(err)
w.WriteHeader(http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/pdf")
w.Write(buf)
}
func createPDF() ([]byte, error) {
ctx, cancel := newTabContext()
defer cancel()
html := `<html>
<body>
<div>text</div>
<img src="https://pkg.go.dev/static/shared/gopher/package-search-700x300.jpeg"/>
<img src="https://go.dev/images/gophers/motorcycle.svg"/>
<img src="https://go.dev/images/go_google_case_study_carousel.png" />
</body>
</html>`
var buf []byte
if err := chromedp.Run(ctx,
chromedp.Navigate("about:blank"),
// set the page content and wait until the page is loaded (including its resources).
chromedp.ActionFunc(func(ctx context.Context) error {
lctx, cancel := context.WithCancel(ctx)
defer cancel()
var wg sync.WaitGroup
wg.Add(1)
chromedp.ListenTarget(lctx, func(ev interface{}) {
if _, ok := ev.(*page.EventLoadEventFired); ok {
// It's a good habit to remove the event listener if we don't need it anymore.
cancel()
wg.Done()
}
})
frameTree, err := page.GetFrameTree().Do(ctx)
if err != nil {
return err
}
if err := page.SetDocumentContent(frameTree.Frame.ID, html).Do(ctx); err != nil {
return err
}
wg.Wait()
return nil
}),
chromedp.ActionFunc(func(ctx context.Context) error {
var err error
buf, _, err = page.PrintToPDF().WithPrintBackground(false).Do(ctx)
if err != nil {
return err
}
return nil
}),
); err != nil {
return nil, err
}
return buf, nil
}
var (
browserCtx context.Context
once sync.Once
)
// newTabContext creates a tab context with the global browser context as its parent context.
//
// When tasks is run with the returned context, a new tab will be created in the browser.
func newTabContext() (context.Context, context.CancelFunc) {
once.Do(func() { initBrowser() })
if browserCtx == nil || browserCtx.Err() != nil {
log.Fatalf("browser is not available: %v", browserCtx.Err())
}
return chromedp.NewContext(browserCtx)
}
// initBrowser starts a browser in which to create new tab for running tasks.
func initBrowser() {
browserCtx, _ = chromedp.NewContext(context.Background())
// to start the browser
if err := chromedp.Run(browserCtx); err != nil {
log.Fatal(err)
}
}
Usage:
go run main.go
curl http://localhost:8080/pdf > sample.pdf
References:
https://github.com/chromedp/chromedp/issues/941
https://github.com/chromedp/chromedp/issues/836

Inadvertent multiple returns

I'm building an application where I get muslim prayer data from multiple sources. The first being S3, the second being aladhan (a public api). I only want to get data from aladhan if it's not available in S3. If I do have to get the data from the public source then I upload it to my s3.
Here is the code:
This is my interface loop code. I've put in print statements to show that I'm running into the return statement twice, once with data in my return struct, the second time the struct is nil.
// prayeriface.go
package prayer
import (
"fmt"
)
type MonthPrayerIface interface {
GetMonthPrayer(input *LookupInput) (*PreCurrNextMonthPrayer, error)
}
type PreCurrNextMonthPrayer struct {
custData *LookupInput
CurrentMonthData *PCal
PreviousMonthData *PCal
NextMonthData *PCal
prayers []MonthPrayerIface
}
func (p *PreCurrNextMonthPrayer) GetMonthPrayers() (*PreCurrNextMonthPrayer, error) {
var err error
var monthlyData *PreCurrNextMonthPrayer
defer func() {
fmt.Printf("return monthlyData address & value = %p %v\n", monthlyData, monthlyData)
}()
for k, data := range p.prayers {
fmt.Printf("loop = %v, data= %T %v\n", k, monthlyData, monthlyData)
monthlyData, err = data.GetMonthPrayer(p.custData)
fmt.Printf("\terr= %v\n", err)
fmt.Printf("\tmonthlyData= %p %v\n", monthlyData, monthlyData)
if err == nil {
fmt.Printf("loop-return: err == nil \n")
return monthlyData, nil
}
}
if err == nil {
fmt.Printf("post-loop:\n")
fmt.Printf("\tmonthlyData= %p %v\n", monthlyData, monthlyData)
return monthlyData, nil
}
return nil, fmt.Errorf("unable to get prayer data from all sources %s", err)
}
func NewMonthPrayer(input *LookupInput, prayers ...MonthPrayerIface) (*PreCurrNextMonthPrayer, error) {
var err error
t := &PreCurrNextMonthPrayer{
custData: input,
prayers: prayers,
}
t, err = t.GetMonthPrayers()
if err != nil {
return nil, err
}
return t, nil
}
As you can see, I'm looping over an interface struct method called GetMonthPrayer
This is my s3 source
// s3.go
package prayer
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/s3"
"io"
awsservices "prayer-times/src/aws"
)
// S3Store s3 storage object for prayer calendars
type S3Store struct {
data *PCal
}
// GetMonthPrayer retrieves prayer data from s3, otherwise from aladhan
func (s *S3Store) GetMonthPrayer(input *LookupInput) (*PreCurrNextMonthPrayer, error) {
mPrayer := new(PreCurrNextMonthPrayer)
fmt.Println("attempting to retrieve prayer data from s3")
s3Client := awsservices.NewS3Service()
pMonthInput := &LookupInput{
Country: input.Country,
ZipCode: input.ZipCode,
custTime: input.custTime.AddDate(0, -1, 0),
}
nMonthInput := &LookupInput{
Country: input.Country,
ZipCode: input.ZipCode,
custTime: input.custTime.AddDate(0, 1, 0),
}
// s3Pdata retrieves data from S3 and
s3pData := func(input *LookupInput) (*PCal, error) {
pCalendar := new(PCal)
data, err := s3Client.GetObject(&s3.GetObjectInput{
Bucket: aws.String(bucket),
Key: aws.String(
fmt.Sprintf(
"%s/%d/%d/%d",
input.Country,
input.ZipCode,
input.custTime.Year(),
input.custTime.Month())),
})
if err != nil {
return nil, err
}
if data == nil {
return nil, errors.New("error data from s3 is nil")
}
defer func() {
err := data.Body.Close()
if err != nil {
fmt.Printf("unable to close s3 body: %s", err)
}
}()
s3buf := bytes.NewBuffer(nil)
if _, err := io.Copy(s3buf, data.Body); err != nil {
return nil, err
}
dataBytes := s3buf.Bytes()
decoder := json.NewDecoder(bytes.NewReader(dataBytes))
err = decoder.Decode(&pCalendar)
if err != nil {
fmt.Printf("unable to decode json: %s", err)
}
return pCalendar, nil
}
aladhanData := new(AladhanStore)
getAladhanData := func(input *LookupInput) (*PreCurrNextMonthPrayer, error) {
data, err := aladhanData.GetMonthPrayer(input)
if err != nil {
return nil, err
}
return data, nil
}
// Get current data from s3, if not s3, then get all three from aladhan
cMonthS3Data, err := s3pData(input)
pMonthS3Data, err := s3pData(pMonthInput)
nMonthS3Data, err := s3pData(nMonthInput)
if err != nil {
adata, err := getAladhanData(input)
if err != nil {
fmt.Printf("err: %s", err)
return nil, err
}
return adata, nil
}
mPrayer.CurrentMonthData = cMonthS3Data
// Get previous month data from s3, if not s3, then get all three from aladhan
mPrayer.PreviousMonthData = pMonthS3Data
// Get next month data from s3, if not s3, then get all three from aladhan
mPrayer.NextMonthData = nMonthS3Data
return mPrayer, nil
}
Here is my aladhan source
// aladhan.go
package prayer
import (
"bytes"
"encoding/json"
"fmt"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/s3/s3manager"
"golang.org/x/sync/errgroup"
"io"
"log"
awsservices "prayer-times/src/aws"
"prayer-times/src/urljsonoutput"
"prayer-times/src/zipcoordinates"
)
var (
aladhanURL string = "https://api.aladhan.com/v1/calendar?"
)
// PCal contains the prayer times of the month as well as the return code
type PCal struct {
Code int `json:"code"`
Status string `json:"status"`
Data []struct {
Timings DailyPrayers
}
}
/*
AladhanData returns the total monthly prayers of given month, coordinates, and zip from aladhan.
https://api.aladhan.com/v1/calendar?latitude=51.508515&longitude=-0.1254872&method=1&month=4&year=2017
*/
func AladhanData(input *LookupInput) *PCal {
coordinates := zipcoordinates.HereCoordinates(&zipcoordinates.GeoLocationInput{
PostalCode: input.ZipCode,
CountryCode: input.Country,
})
respStruct := new(PCal)
_, err := urljsonoutput.GetURLJSON(fmt.Sprintf(
"%slatitude=%v&longitude=%v&method=%v&month=%v&year=%v",
aladhanURL,
coordinates.Items[0].Position.Latitude,
coordinates.Items[0].Position.Longitude,
input.Method,
int(input.custTime.Month()),
input.custTime.Year()), respStruct)
if err != nil {
log.Fatalf("unable to pull monthly prayer data %v", err)
}
return respStruct
}
// AladhanStore struct to interact with interface for GetMonthPrayer
type AladhanStore struct {
data *PCal
}
// GetMonthPrayer Pulls prayer data from aladhan
func (a *AladhanStore) GetMonthPrayer(input *LookupInput) (*PreCurrNextMonthPrayer, error) {
mPrayer := new(PreCurrNextMonthPrayer)
// Return prayer data from aladhan
custPMonthTime := input.custTime.AddDate(0, -1, 0)
pMonthLookupInput := new(LookupInput)
pMonthLookupInput.custTime = custPMonthTime
pMonthLookupInput.ZipCode = input.ZipCode
pMonthLookupInput.Country = input.Country
custNMonthTime := input.custTime.AddDate(0, 1, 0)
nMonthLookupInput := new(LookupInput)
nMonthLookupInput.custTime = custNMonthTime
nMonthLookupInput.ZipCode = input.ZipCode
nMonthLookupInput.Country = input.Country
prayerData := AladhanData(input)
pMonthPData := AladhanData(pMonthLookupInput)
nMonthPData := AladhanData(nMonthLookupInput)
// Save prayer data into io.Reader to save to s3
var Marshal = func(data interface{}) (io.ReadSeeker, error) {
mdata, err := json.MarshalIndent(data, "", "\t")
if err != nil {
return nil, err
}
return bytes.NewReader(mdata), nil
}
rmData, err := Marshal(prayerData)
pRmData, err := Marshal(pMonthPData)
nRmData, err := Marshal(nMonthPData)
if err != nil {
return nil, err
}
// Save prayer data into s3
g := new(errgroup.Group)
s3Upload := func(rawData *io.ReadSeeker, input *LookupInput) func() error {
return func() error {
s3Client := s3manager.NewUploaderWithClient(awsservices.NewS3Service())
_, err = s3Client.Upload(&s3manager.UploadInput{
Bucket: aws.String(bucket),
Key: aws.String(
fmt.Sprintf(
"%s/%d/%d/%d",
input.Country,
input.ZipCode,
input.custTime.Year(),
int(input.custTime.Month()))),
Body: *rawData,
})
if err != nil {
return err
}
return nil
}
}
g.Go(s3Upload(&pRmData, pMonthLookupInput))
g.Go(s3Upload(&rmData, input))
g.Go(s3Upload(&nRmData, nMonthLookupInput))
if err := g.Wait(); err == nil {
mPrayer.PreviousMonthData = pMonthPData
mPrayer.CurrentMonthData = prayerData
mPrayer.NextMonthData = nMonthPData
return mPrayer, nil
}
return nil, err
}
Here is my test file.
func TestPrayer(t *testing.T) {
p, err := NewMonthPrayer(
&input,
&S3Store{},
&AladhanStore{},
)
if err != nil {
t.Errorf("error: %s", err)
}
data, err := p.GetMonthPrayers()
if err != nil {
t.Errorf("error: %s", err)
}
t.Logf("Test address: %p", data)
t.Logf("data THIS SHOULDN'T BE NIL: %v", data)
t.Logf("ERROR: %s", err)
}
These are my results. Ignore the pass result, the data is first not nil and second nil.
=== RUN TestPrayer
loop = 0, data= *prayer.PreCurrNextMonthPrayer <nil>
attempting to retrieve prayer data from s3
err= <nil>
monthlyData= 0xc000131180 &{<nil> 0xc0002612f0 0xc00051e780 0xc00011cea0 []}
loop-return: err == nil
return monthlyData address & value = 0xc000131180 &{<nil> 0xc0002612f0 0xc00051e780 0xc00011cea0 []}
post-loop:
monthlyData= 0x0 <nil>
return monthlyData address & value = 0x0 <nil>
prayer_test.go:53: Test address: 0x0
prayer_test.go:55: data THIS SHOULDN'T BE NIL: <nil>
prayer_test.go:56: ERROR: %!s(<nil>)
--- PASS: TestPrayer (0.32s)
PASS
The duplicate was due to the GetMonthPrayer call from NewMonthPrayer, which shouldn't have been the case to begin with. It was called first but returned second, thus overwriting the existing data.
func NewMonthPrayer(input *LookupInput, prayers ...MonthPrayerIface) (*PreCurrNextMonthPrayer, error) {
var err error
t := &PreCurrNextMonthPrayer{
custData: input,
prayers: prayers,
}
t, err = t.GetMonthPrayers()
if err != nil {
return nil, err
}
return t, nil
}
I removed the NewMonthPrayer entirely as it was unnecessary, I also removed the function call in the process, thus fixing the initial problem.
// NewPrayer instantiates a prayer type object with the required input
func NewPrayer(input *LookupInput, prayers ...MonthPrayerIface) *Prayer {
return &Prayer{
custData: input,
prayers: prayers,
}
}

Go is not writing complete data to text file

I am trying to explore Go concurrency. Here Grabber() prints and writes the result of the execution. The program prints the expected result, but does not write it to urls.txt. Can anyone explain to me what i am missing here?
main.go
package main
import (
"bufio"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"strings"
"sync"
)
var wg sync.WaitGroup
var mt sync.Mutex
// Final Literation
func main() {
file, err := os.Open("ip.txt")
if err != nil {
log.Fatal(err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
go Grabber(scanner.Text())
wg.Add(1)
}
wg.Wait()
if err := scanner.Err(); err != nil {
log.Fatal(err)
}
}
// stringInArray do If string in list return true false otherwise.
func stringInArray(a string, list []string) bool {
for _, b := range list {
if b == a {
return true
}
}
return false
}
// Grabber Do Search the bing and collect array of sitelist
func Grabber(ip string) {
defer wg.Done()
var output []string
outfile, err := os.Create("urls.txt")
if err != nil {
log.Fatal(err)
}
defer outfile.Close()
if ip == "" {
}
page := 1
for page < 251 {
client := &http.Client{}
req, err := http.NewRequest(
http.MethodGet,
fmt.Sprintf(
"http://www.bing.com/search?q=ip:%s+&count=50&first=1",
ip,
),
nil,
)
if err != nil {
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:57.0) Gecko/20100101 Firefox/57.0")
res, err := client.Do(req)
if err != nil {
fmt.Println("Invalid Request")
}
defer res.Body.Close()
body, err := ioutil.ReadAll(res.Body)
if err != nil {
fmt.Println("Couldn't Read")
}
re := regexp.MustCompile(`<h2><a href="(.*?)"`)
links := re.FindAllString(string(body), -1)
if links != nil {
for l := range links {
o := strings.Split(links[l], `"`)
d := strings.Split(o[1], "/")
s := d[0] + "//" + d[2]
if !stringInArray(s, output) {
output = append(output, s)
}
}
}
page = page + 50
}
for _, links := range output {
fmt.Println(links)
fmt.Fprintln(outfile, links)
}
}
Ip.txt as input
103.253.145.129
103.253.146.125
103.253.146.239
103.253.147.72
146.185.176.79
146.185.176.45
146.185.179.250
146.185.180.35
146.185.180.185
146.185.180.113
146.185.181.51
146.185.183.107
146.185.183.202
146.185.183.248
146.185.183.219
146.185.184.69
146.185.185.169
git repo URLGrabber
You are calling create in each goroutine, which will truncate the file. Instead, create the file outside, and serialize the writes to it using another goroutine:
outfile, err := os.Create("urls.txt")
results:=make(chan []string)
go func() {
for output:=range results {
for _, links := range output {
fmt.Fprintln(outfile, links)
}
}
}()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
go Grabber(scanner.Text(), results)
wg.Add(1)
}
wg.Wait()
close(results)
When you get the results in Grabber, instead of writing it to the file, write it to the channel:
results<-output
for _, links := range output {
fmt.Println(links)
}

I'm encountering multiple errors when trying concurrently parse sites with GO

Following some #Sam Whited advance and doing some research on stack, i've rewritten my code see below: This version of the code seems more stable, however, it is having issues where every once and a while the i get a slew of TCP errors as if i'm no closing my requests. I've throttled the requests by adding a sleep. It seems to help a bit.
func main() {
runtime.GOMAXPROCS(maxParallelism())
var file = flag.String("f", "", "Enter new line deliminated text file")
var fileName = flag.String("s", "contact_bot.csv", "Enter new line deliminated text file")
flag.Parse()
if *file != "" {
counter := 0
filters = []string{"info", "ads", "sales", "sale", "info", "media", "mediarelations", "media_relations", "contact", "contacts", "contactus", "contact_us", "contact-us", "about_us", "general", "advertise", "support", "systems", "system"}
emailRE = regexp.MustCompile(`([a-z0-9!#$%&'*+\/=?^_{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_{|}~-]+)*(#|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)`)
seedUrls, err := readLines(*file)
checkErr(err)
numberOfUrls := len(seedUrls)
usr, err := user.Current()
checkErr(err)
parentPath := filepath.Join(usr.HomeDir, "/Desktop/"+*fileName)
file, err := os.Create(parentPath)
checkErr(err)
defer file.Close()
writer := csv.NewWriter(file)
defer writer.Flush()
var header = []string{"URL", "EMAILS"}
err = writer.Write(header)
checkErr(err)
data = make(chan *HTTPResponse)
go asyncHTTPGets(seedUrls)
loop:
for result := range data {
counter++
emails := findEmails(result.HTML, filters)
fmt.Printf("%s, %s, %s\n", result.URL, emails, strconv.Itoa(numberOfUrls))
var row = []string{result.URL, strings.Join(emails, ",")}
err := writer.Write(row)
// writer.Flush()
checkErr(err)
if counter == len(seedUrls) {
break loop
}
numberOfUrls--
}
}
}
// AsyncHTTPGets ...
func asyncHTTPGets(urls []string) {
counter := 0
for _, url := range urls {
counter++
if counter%10 == 0 {
time.Sleep(1 * time.Second)
}
go func(url string) {
fmt.Printf("Fetching %s \n", url)
resp, err := http.Get(url)
if err != nil {
fmt.Println(err.Error())
data <- &HTTPResponse{url, err.Error()}
return
}
b := resp.Body
buf := new(bytes.Buffer)
buf.ReadFrom(b)
resp.Body.Close()
myHTML := buf.String()
data <- &HTTPResponse{url, myHTML}
}(url)
}
}
func findEmails(html string, filters []string) []string {
emails := emailRE.FindAllString(html, -1)
filteredEmails := []string{}
for _, email := range emails {
if stringInSlice(email, filters) {
if !stringInSlice(email, filteredEmails) {
filteredEmails = append(filteredEmails, email)
}
}
}
sort.Strings(filteredEmails)
return filteredEmails
}
The application will open a large number of sockets and possibly breach file descriptor limits. I suggest limiting the number of concurrent requests to prevent this issue:
var (
requestMu sync.Mutex // protects requestCount
requestCount int // incremented on each request
)
// Create 10 workers. Adjust up or down as needed.
for w := 0; w < 10; w++ {
go func() {
for {
// Increment request count. Exit at end.
requestMu.Lock()
i := requestCount
requestCount++
requestMu.Unlock()
if i >= len(seedUrls) {
return
}
// Fetch the current URL.
myURL := seedUrls[i]
resp, err := http.Get(myUrl)
if err != nil {
fmt.Println(myURL, err.Error(), i)
data <- &HTTPResponse{myURL, err.Error()}
continue
}
// Read body and close.
b, err := ioutil.ReadAll(resp.Body)
resp.Body.Close()
if err != nil {
fmt.Println(myURL, err.Error(), i)
data <- &HTTPResponse{myURL, err.Error()}
continue
}
myHTML := string(b)
data <- &HTTPResponse{myURL, myHTML}
}
}()
}
// Recieve expected number of results
for i := 0; i < len(seedUrls); i++ {
result <- data
emails := findEmails(result.HTML, filters)
fmt.Printf("%s, %s, %d\n", result.URL, emails, i)
var row = []string{result.URL, strings.Join(emails, ",")}
err := writer.Write(row)
writer.Flush()
if err != nil {
panic(err)
}
}

Reading CSV file in Go

Here is a code snippet that reads CSV file:
func parseLocation(file string) (map[string]Point, error) {
f, err := os.Open(file)
defer f.Close()
if err != nil {
return nil, err
}
lines, err := csv.NewReader(f).ReadAll()
if err != nil {
return nil, err
}
locations := make(map[string]Point)
for _, line := range lines {
name := line[0]
lat, laterr := strconv.ParseFloat(line[1], 64)
if laterr != nil {
return nil, laterr
}
lon, lonerr := strconv.ParseFloat(line[2], 64)
if lonerr != nil {
return nil, lonerr
}
locations[name] = Point{lat, lon}
}
return locations, nil
}
Is there a way to improve readability of this code? if and nil noise.
Go now has a csv package for this. Its is encoding/csv. You can find the docs here: https://golang.org/pkg/encoding/csv/
There are a couple of good examples in the docs. Here is a helper method I created to read a csv file and returns its records.
package main
import (
"encoding/csv"
"fmt"
"log"
"os"
)
func readCsvFile(filePath string) [][]string {
f, err := os.Open(filePath)
if err != nil {
log.Fatal("Unable to read input file " + filePath, err)
}
defer f.Close()
csvReader := csv.NewReader(f)
records, err := csvReader.ReadAll()
if err != nil {
log.Fatal("Unable to parse file as CSV for " + filePath, err)
}
return records
}
func main() {
records := readCsvFile("../tasks.csv")
fmt.Println(records)
}
Go is a very verbose language, however you could use something like this:
// predeclare err
func parseLocation(file string) (locations map[string]*Point, err error) {
f, err := os.Open(file)
if err != nil {
return nil, err
}
defer f.Close() // this needs to be after the err check
lines, err := csv.NewReader(f).ReadAll()
if err != nil {
return nil, err
}
//already defined in declaration, no need for :=
locations = make(map[string]*Point, len(lines))
var lat, lon float64 //predeclare lat, lon
for _, line := range lines {
// shorter, cleaner and since we already have lat and err declared, we can do this.
if lat, err = strconv.ParseFloat(line[1], 64); err != nil {
return nil, err
}
if lon, err = strconv.ParseFloat(line[2], 64); err != nil {
return nil, err
}
locations[line[0]] = &Point{lat, lon}
}
return locations, nil
}
//edit
A more efficient and proper version was posted by #Dustin in the comments, I'm adding it here for completeness sake:
func parseLocation(file string) (map[string]*Point, error) {
f, err := os.Open(file)
if err != nil {
return nil, err
}
defer f.Close()
csvr := csv.NewReader(f)
locations := map[string]*Point{}
for {
row, err := csvr.Read()
if err != nil {
if err == io.EOF {
err = nil
}
return locations, err
}
p := &Point{}
if p.lat, err = strconv.ParseFloat(row[1], 64); err != nil {
return nil, err
}
if p.lon, err = strconv.ParseFloat(row[2], 64); err != nil {
return nil, err
}
locations[row[0]] = p
}
}
playground
I basically copied my answer from here: https://www.dotnetperls.com/csv-go. For me, this was a better answer than what I found on stackoverflow.
import (
"bufio"
"encoding/csv"
"os"
"fmt"
"io"
)
func ReadCsvFile(filePath string) {
// Load a csv file.
f, _ := os.Open(filePath)
// Create a new reader.
r := csv.NewReader(f)
for {
record, err := r.Read()
// Stop at EOF.
if err == io.EOF {
break
}
if err != nil {
panic(err)
}
// Display record.
// ... Display record length.
// ... Display all individual elements of the slice.
fmt.Println(record)
fmt.Println(len(record))
for value := range record {
fmt.Printf(" %v\n", record[value])
}
}
}
I also dislike the verbosity of the default Reader, so I made a new type that is
similar to bufio#Scanner:
package main
import "encoding/csv"
import "io"
type Scanner struct {
Reader *csv.Reader
Head map[string]int
Row []string
}
func NewScanner(o io.Reader) Scanner {
csv_o := csv.NewReader(o)
a, e := csv_o.Read()
if e != nil {
return Scanner{}
}
m := map[string]int{}
for n, s := range a {
m[s] = n
}
return Scanner{Reader: csv_o, Head: m}
}
func (o *Scanner) Scan() bool {
a, e := o.Reader.Read()
o.Row = a
return e == nil
}
func (o Scanner) Text(s string) string {
return o.Row[o.Head[s]]
}
Example:
package main
import "strings"
func main() {
s := `Month,Day
January,Sunday
February,Monday`
o := NewScanner(strings.NewReader(s))
for o.Scan() {
println(o.Text("Month"), o.Text("Day"))
}
}
https://golang.org/pkg/encoding/csv
You can also read contents of a directory to load all the CSV files. And then read all those CSV files 1 by 1 with goroutines
csv file:
101,300.00,11000901,1155686400
102,250.99,11000902,1432339200
main.go file:
const sourcePath string = "./source"
func main() {
dir, _ := os.Open(sourcePath)
files, _ := dir.Readdir(-1)
for _, file := range files {
fmt.Println("SINGLE FILE: ")
fmt.Println(file.Name())
filePath := sourcePath + "/" + file.Name()
f, _ := os.Open(filePath)
defer f.Close()
// os.Remove(filePath)
//func
go func(file io.Reader) {
records, _ := csv.NewReader(file).ReadAll()
for _, row := range records {
fmt.Println(row)
}
}(f)
time.Sleep(10 * time.Millisecond)// give some time to GO routines for execute
}
}
And the OUTPUT will be:
$ go run main.go
SINGLE FILE:
batch01.csv
[101 300.00 11000901 1155686400]
[102 250.99 11000902 1432339200]
----------------- -------------- ---------------------- -------
---------------- ------------------- ----------- --------------
Below example with the Invoice struct
func main() {
dir, _ := os.Open(sourcePath)
files, _ := dir.Readdir(-1)
for _, file := range files {
fmt.Println("SINGLE FILE: ")
fmt.Println(file.Name())
filePath := sourcePath + "/" + file.Name()
f, _ := os.Open(filePath)
defer f.Close()
go func(file io.Reader) {
records, _ := csv.NewReader(file).ReadAll()
for _, row := range records {
invoice := new(Invoice)
invoice.InvoiceNumber = row[0]
invoice.Amount, _ = strconv.ParseFloat(row[1], 64)
invoice.OrderID, _ = strconv.Atoi(row[2])
unixTime, _ := strconv.ParseInt(row[3], 10, 64)
invoice.Date = time.Unix(unixTime, 0)
fmt.Printf("Received invoice `%v` for $ %.2f \n", invoice.InvoiceNumber, invoice.Amount)
}
}(f)
time.Sleep(10 * time.Millisecond)
}
}
type Invoice struct {
InvoiceNumber string
Amount float64
OrderID int
Date time.Time
}

Resources