Retry request in go-colly - go

I have this scraper library. I would like to switch to a different user agent if the request with the first user agent returns an error, but this code doesn't work: when the first user agent fails, I send a second attempt, but the function never finishes because OnHTML is never triggered:
package scraper
import (
"net/http"
"github.com/davecgh/go-spew/spew"
"github.com/gocolly/colly"
)
// User-Agent header values used by Scrape2: fbUserAgent is sent with the
// first request and userAgent with the retry issued from the OnError callback.
const (
fbUserAgent = "ua 1"
userAgent = "ua 2"
)
// ScrapeResult holds the page metadata that Scrape2 collects in its OnHTML
// callback.
type ScrapeResult struct {
Title string // value returned by FindTitle
Description string // value returned by FindDescription
SiteName string // value returned by FindSiteName
URL string // the URL that was passed to Scrape2
Images []string // values returned by FindImages
}
// Scrape2 requests url with a colly collector and returns the metadata
// gathered by the OnHTML callback, or the error recorded by the OnError
// callback. The first request is sent with fbUserAgent; when that request
// fails, OnError issues a second request with userAgent.
func Scrape2(url string) (*ScrapeResult, error) {
var (
res *ScrapeResult
scrapeErr error
done = make(chan bool, 1)
c = colly.NewCollector()
)
c.OnError(func(r *colly.Response, err error) {
// A failed fbUserAgent request triggers one retry with userAgent; a failure
// of any other request is handed back to the caller.
if ua := r.Request.Headers.Get("User-Agent"); ua == fbUserAgent {
// NOTE(review): the error returned by c.Request is discarded. If the retry
// is rejected before it is sent, no callback fires, nothing is sent on done,
// and the receive at the end of Scrape2 blocks forever. Presumably colly
// refuses to request a URL it has already visited unless AllowURLRevisit is
// set — confirm against the colly documentation.
c.Request(
"GET",
url,
nil,
nil,
http.Header{
"User-Agent": []string{userAgent},
"Accept": []string{"*/*"},
},
)
} else {
scrapeErr = err
done <- true
}
})
c.OnHTML("html", func(e *colly.HTMLElement) {
// Debug trace showing that the callback ran.
spew.Dump("ON HTML")
res = &ScrapeResult{URL: url}
res.Title = FindTitle(e)
res.Description = FindDescription(e)
res.SiteName = FindSiteName(e)
res.Images = FindImages(e)
done <- true
})
// First attempt, sent with fbUserAgent. NOTE(review): the returned error is
// ignored here as well.
c.Request(
"GET",
url,
nil,
nil,
http.Header{
"User-Agent": []string{fbUserAgent},
"Accept": []string{"*/*"}, // * / *
"Accept-Language": []string{"en-GB,en-US;q=0.9,en;q=0.8"},
"Accept-Encoding": []string{"gzip, deflate, br"},
"Connection": []string{"keep-alive"},
"sec-ch-ua": []string{` Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90`},
},
)
// Wait until one of the callbacks above signals completion.
<- done
return res, scrapeErr
}
// FindTitle returns the content attribute of the og:title meta tag found
// under e. The result is the empty string when the lookup yields no content.
func FindTitle(e *colly.HTMLElement) string {
	return e.ChildAttr(`meta[property="og:title"]`, "content")
}
// FindDescription returns the content attribute of the og:description meta
// tag found under e. The result is the empty string when the lookup yields no
// content.
func FindDescription(e *colly.HTMLElement) string {
	return e.ChildAttr(`meta[property="og:description"]`, "content")
}
// FindSiteName returns the content attribute of the og:site_name meta tag
// found under e. The result is the empty string when the lookup yields no
// content.
func FindSiteName(e *colly.HTMLElement) string {
	return e.ChildAttr(`meta[property="og:site_name"]`, "content")
}
// FindImages returns a slice holding the content attribute of the og:image
// meta tag found under e. When the lookup yields no content the returned
// slice is empty but non-nil.
func FindImages(e *colly.HTMLElement) []string {
	ogImage := e.ChildAttr(`meta[property="og:image"]`, "content")
	if ogImage == "" {
		return make([]string, 0)
	}
	return []string{ogImage}
}
How can I make colly send the request a second time and trigger OnHTML? Thank you.

You can set the collector's CheckHead property to true (collector.CheckHead = true).
This ensures that a HEAD request is performed first to check for connection issues and, if it fails, there is a retry.
You will need v2 of gocolly (github.com/gocolly/colly/v2) to have this feature included.
https://github.com/gocolly/colly/blob/master/colly.go#L110

Related

Gin context variable overridden with parallel REST API requests

go version: 1.19
gin version (or commit ref): 1.8.1
operating system: Ubuntu
I have a SaaS project that is based on REST APIs. All APIs are developed in Go using the gin package. When a user logs in, I set the current user's details in the request context so that I can access them later to display some data. However, I had a case in which two requests hit the server in parallel and the context values of the first request were overwritten by the context values of the second request. Because of this, the wrong data is displayed.
package main
import (
"fmt"
"strings"
"github.com/gin-gonic/gin"
"github.com/golang-jwt/jwt"
)
// main builds the gin engine, installs ParseJWTToken as middleware, registers
// the /hello/:name route and starts serving on port 9000.
func main() {
	router := gin.Default()
	router.Use(ParseJWTToken)
	router.GET("/hello/:name", hello)
	router.Run(":9000")
}
// hello replaces c.Keys with a fresh map holding a fixed user id and the
// :name path parameter, prints that map, and answers "Hello <name>".
func hello(c *gin.Context) {
	name := c.Param("name")
	c.Keys = map[string]interface{}{
		"current_user_id":   10,
		"current_user_name": name,
	}
	fmt.Println(c.Keys)
	c.String(200, "Hello %s", name)
}
// NOTE(review): role, userName and userId are package-level variables, so
// every call to ParseJWTToken writes to and reads from the same three
// variables. The resolution reported in this thread was to declare them inside
// the middleware instead.
var role, userName string
var userId float64
// ParseJWTToken is a gin middleware. It reads the Authorization header, parses
// the bearer token with jwt.Parse using config.SignKey, stores the "id",
// "role" and "name" claims in c.Keys under "userid", "role" and "username",
// and calls c.Next only when the token's "user_agent" claim equals the
// request's User-Agent header. In every other case the request is aborted.
func ParseJWTToken(c *gin.Context) {
merchantDatabase := make(map[string]interface{})
if values, _ := c.Request.Header["Authorization"]; len(values) > 0 {
// NOTE(review): bearer[1] is indexed without checking how many parts Split
// returned.
bearer := strings.Split(c.Request.Header["Authorization"][0], "Bearer")
bearerToken := strings.TrimSpace(bearer[1])
var userAgent string
var userAgentCheck bool
if values, _ := c.Request.Header["User-Agent"]; len(values) > 0 {
userAgent = values[0]
}
// The result of InitKeys is deliberately discarded here.
_ = config.InitKeys()
token, err := jwt.Parse(bearerToken, func(token *jwt.Token) (interface{}, error) {
return config.SignKey, nil
})
if err != nil {
c.Abort()
return
}
if !token.Valid {
c.Abort()
return
}
if len(token.Claims.(jwt.MapClaims)) > 0 {
for key, claim := range token.Claims.(jwt.MapClaims) {
if key == "user_agent" {
if claim == userAgent {
userAgentCheck = true
}
}
// NOTE(review): the type assertions below are unchecked; a claim of a
// different type makes the assertion panic.
if key == "role" {
role = claim.(string)
}
if key == "id" {
userId = claim.(float64)
}
if key == "name" {
userName = claim.(string)
}
}
}
// The values copied here come from the package-level variables written in the
// loop above.
merchantDatabase["userid"] = userId
merchantDatabase["role"] = role
merchantDatabase["username"] = userName
c.Keys = merchantDatabase
if userAgentCheck {
c.Next()
} else {
c.Abort()
return
}
} else {
c.Abort()
return
}
}
This issue does not occur every time for parallel requests.
How can I fix that?
I had used global variables for the details that were being overwritten. Declaring these variables inside the middleware fixed the issue. The complete thread is here: https://github.com/gin-gonic/gin/issues/3437

go-colly returning only the scraped data from the first page

I am trying to scrape a webpage with the colly framework, using the code below. The scraping itself works, but only the results from the first page are returned, even though there are three pages in this example.
The scraper visits all three pages but returns only the products from the first one.
scraper.go:
package scraper
import (
"fmt"
"strings"
"github.com/gocolly/colly"
"github.com/gocolly/colly/extensions"
)
// Scraper searches baseUrl for prod and returns the products collected by the
// OnHTML callbacks. It builds the search URL, registers the collector
// callbacks, calls c.Visit on the search URL and then returns the products
// slice that the callbacks append to.
func Scraper(prod, baseUrl string) []Product {
// Replace blank spaces with '%20' to make the URL work
var urlReplacer *strings.Replacer = strings.NewReplacer(" ", "%20")
// Removes 'R$' from the price and remove blank spaces
var priceReplacer *strings.Replacer = strings.NewReplacer("R$", "", " ", "")
products := make([]Product, 0)
prod = urlReplacer.Replace(prod)
searchUrl := fmt.Sprintf("%s%s%s", baseUrl, "/pesquisa?pg=1&t=", prod)
c := colly.NewCollector()
s := Selector()
// For every element matching s.main, turn each s.container child into a
// Product and append it to products.
c.OnHTML(s.main, func(e *colly.HTMLElement) {
e.ForEach(s.container, func(_ int, h *colly.HTMLElement) {
fullPrice := h.ChildText(s.fullPrice)
product := Product{
name: h.ChildText(s.name),
fullPrice: priceReplacer.Replace(fullPrice),
link: fmt.Sprintf("%s%s", baseUrl, h.ChildAttr(s.link, "href")),
}
products = append(products, product)
})
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})
// NOTE(review): the value printed after "failed with response:" is r.Request,
// not the response r.
c.OnError(func(r *colly.Response, err error) {
fmt.Println("Request URL:", r.Request.URL, "failed with response:", r.Request, "\nError:", err)
})
// Pagination: follow the s.nextPage link only when its text is "Próximo";
// otherwise abort the current request.
c.OnHTML(s.nextPage, func(e *colly.HTMLElement) {
nextPage := urlReplacer.Replace(fmt.Sprintf("%s%s", baseUrl, e.Attr("href")))
nextPageStr := e.Text
isNextPage := nextPageStr == "Próximo"
fmt.Println(nextPageStr)
if isNextPage {
// NOTE(review): the error returned by c.Visit is not checked.
c.Visit(nextPage)
} else {
e.Request.Abort()
}
})
c.OnScraped(func(r *colly.Response) {
fmt.Println("Finished", r.Request.URL)
fmt.Println(products)
})
// Uses a random User-Agent in each request
extensions.RandomUserAgent(c)
// NOTE(review): the error returned by c.Visit is not checked.
c.Visit(searchUrl)
return products
}
selector.go:
package scraper
// ProdSelector groups the CSS selectors used to pull product data out of a
// search-result page.
type ProdSelector struct {
	main      string // element that wraps the product list
	container string // one product entry inside main
	name      string // product name inside a container
	fullPrice string // product price inside a container
	link      string // anchor whose href points at the product
	nextPage  string // pagination link to the following page
}

// Selector returns the ProdSelector for the target website. It is a function
// rather than a variable so callers can treat the selectors as constants.
func Selector() ProdSelector {
	var sel ProdSelector
	sel.main = "ul.row"
	sel.container = "div.inner"
	sel.name = "h3.name.no-medium.no-tablet"
	sel.fullPrice = "strong.sale-price > span:nth-child(1)"
	sel.link = "a"
	sel.nextPage = "a.page-next"
	return sel
}
product.go:
package scraper
// Product is one scraped search result. All fields are unexported, so code
// outside package scraper cannot read them directly.
type Product struct {
name string // product name text
fullPrice string // price text with "R$" and blank spaces removed
link string // base URL joined with the product anchor's href
}
main.go:
package main
import (
"fmt"
"github.com/Antonio-Costa00/Go-Price-Monitor-Alibaba-Website/scraper"
)
func main() {
products := scraper.Scraper("Guitar", "https://www.brasiltronic.com.br")
fmt.Println(products)
}
I am not sure, but I think the problem is related to asynchrony. Could someone help me with this? Thank you for your attention.

Go gin nested JSON request body POST, error unexpected end of JSON input

I am new to Go and was trying to create a simple POST API with gin and gorm.
The request data is nested JSON like below:
{
"fall_orders_request": [
{
"fruit": "Watermelon",
"vegetable": "Carrot"
}
],
"spring_orders_request": [
{
"fruit": "Watermelon",
"vegetable": "Carrot",
"cupcake": "minions"
}
],
"custome_rates": [
{
"fruit": "Watermelon",
"greentea": "Japanese",
"cupcake": "pokemon"
}
]
}
After receiving the request i.e orders the backend will save it to corresponding Databases for each session.
This is my code for the orders.go:
package order
import (
"net/http"
"github.com/gin-gonic/gin"
"gorm.io/gorm"
)
// FallOrders is the record created for each element of
// AllOrdersRequest.FallOrdersRequest. ID is tagged as the gorm primary key.
type FallOrders struct {
ID uint `gorm:"primarykey"`
Fruit string `json:"fruit"`
Vegetable string `json:"vegetable"`
}
// SpringOrders is the record created for each element of
// AllOrdersRequest.SpringOrdersRequest.
type SpringOrders struct {
ID uint `gorm:"primarykey"`
Fruit string `json:"fruit"`
Vegetable string `json:"vegetable"`
Cupcake string `json:"cupcake"`
}
// WinterOrders is the record created for each element of
// AllOrdersRequest.WinterOrdersRequest.
type WinterOrders struct {
ID uint `gorm:"primarykey"`
Fruit string `json:"fruit"`
Greentea string `json:"greentea"`
Cupcake string `json:"cupcake"`
}
// allOrders groups the three record slices under their JSON keys.
// NOTE(review): not referenced by the handler shown in this file.
type allOrders struct {
FallOrders []FallOrders `json:"fall_orders"`
SpringOrders []SpringOrders `json:"spring_orders"`
WinterOrders []WinterOrders `json:"winter_orders"`
}
// FallOrdersRequest is one entry of the "fall_orders_request" JSON array.
type FallOrdersRequest struct {
Fruit string `json:"fruit"`
Vegetable string `json:"vegetable"`
}
// SpringOrdersRequest is one entry of the "spring_orders_request" JSON array.
type SpringOrdersRequest struct {
Fruit string `json:"fruit"`
Vegetable string `json:"vegetable"`
Cupcake string `json:"cupcake"`
}
// WinterOrdersRequest is one entry of the "winter_orders_request" JSON array.
type WinterOrdersRequest struct {
Fruit string `json:"fruit"`
Greentea string `json:"greentea"`
Cupcake string `json:"cupcake"`
}
// AllOrdersRequest is the request body bound by CreateModularRates. Each field
// maps to the JSON key named in its tag.
type AllOrdersRequest struct {
FallOrdersRequest []FallOrdersRequest `json:"fall_orders_request"`
SpringOrdersRequest []SpringOrdersRequest `json:"spring_orders_request"`
WinterOrdersRequest []WinterOrdersRequest `json:"winter_orders_request"`
}
// AllOrdersManager holds a gorm database handle.
// NOTE(review): not referenced by the handler shown in this file.
type AllOrdersManager struct {
DB *gorm.DB
}
// FallOrdersManager holds the gorm handle used to create FallOrders rows.
type FallOrdersManager struct {
DB *gorm.DB
}
// SpringOrdersManager holds the gorm handle used to create SpringOrders rows.
type SpringOrdersManager struct {
DB *gorm.DB
}
// WinterOrdersManager holds the gorm handle used to create WinterOrders rows.
type WinterOrdersManager struct {
DB *gorm.DB
}
// CreateModularRates returns a gin handler that binds the JSON request body to
// an AllOrdersRequest and, for every fall, spring and winter entry, builds the
// matching record, writes it to the response with status 201 and passes it to
// DB.Create on the corresponding manager. A body that fails to bind is
// answered with status 400 and the bind error.
func CreateModularRates() gin.HandlerFunc {
return func(c *gin.Context) {
var aor AllOrdersRequest
// NOTE(review): form, sorm and worm are zero values and nothing in this
// function assigns their DB field, so DB is nil when Create is called below.
var form FallOrdersManager
var sorm SpringOrdersManager
var worm WinterOrdersManager
if err := c.BindJSON(&aor); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
for _, fall := range aor.FallOrdersRequest {
fallOrders := FallOrders{
Fruit: fall.Fruit,
Vegetable: fall.Vegetable,
}
// NOTE(review): the response is written before the insert and once per
// element, so a request with several entries produces several c.JSON calls.
c.JSON(http.StatusCreated, fallOrders)
res := form.DB.Create(&fallOrders)
if res.Error != nil {
// The handler stops here without reporting res.Error to the client.
return
}
}
for _, spring := range aor.SpringOrdersRequest {
springOrders := SpringOrders{
Fruit: spring.Fruit,
Vegetable: spring.Vegetable,
Cupcake: spring.Cupcake,
}
c.JSON(http.StatusCreated, springOrders)
res := sorm.DB.Create(&springOrders)
if res.Error != nil {
return
}
}
for _, winter := range aor.WinterOrdersRequest {
winterOrders := WinterOrders{
Fruit: winter.Fruit,
Greentea: winter.Greentea,
Cupcake: winter.Cupcake,
}
c.JSON(http.StatusCreated, winterOrders)
res := worm.DB.Create(&winterOrders)
if res.Error != nil {
return
}
}
}
}
And this is the automated test orders_test.go
package order
import (
"bytes"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"github.com/gin-gonic/gin"
"github.com/stretchr/testify/assert"
)
// TestOrderData builds the AllOrdersRequest fixture used by TestOrderCreation:
// one fall, one spring and one winter entry.
// NOTE(review): this file declares "package order" yet qualifies every type
// with "order."; the spring and winter literals name types
// (order.spring_orders_request, order.winter_orders_request) that orders.go
// does not declare; and the keys of the returned literal are the JSON tag
// names rather than the struct field names.
func TestOrderData() order.AllOrdersRequest {
fall_orders_request := []order.FallOrdersRequest{}
spring_orders_request := []order.SpringOrdersRequest{}
winter_orders_request := []order.WinterOrdersRequest{}
fall_orders_request = append(fall_orders_request, order.FallOrdersRequest{
Fruit: "Watermelon",
Vegetable: "Carrot",
})
spring_orders_request = append(spring_orders_request, order.spring_orders_request{
Fruit: "Watermelon",
Vegetable: "Carrot",
Cupcake: "minions",
})
winter_orders_request = append(winter_orders_request, order.winter_orders_request{
Fruit: "Watermelon",
Greentea: "Japanese",
Cupcake: "pokemon",
})
return order.AllOrdersRequest{
fall_orders_request: fall_orders_request,
spring_orders_request: spring_orders_request,
winter_orders_request: winter_orders_request,
}
}
// TestOrderCreation marshals the fixture to JSON, builds a POST /orders
// request around it, and asserts that the recorder's status code is 201.
// NOTE(review): the request is never served by a handler, so nothing writes
// to w before the assertion; c is a nil *gin.Context when ShouldBindJSON is
// called on it; and the errors from json.Marshal and http.NewRequest are
// discarded.
func TestOrderCreation(t *testing.T) {
params := TestOrderData()
jsonPayload, _ := json.Marshal(params)
w := httptest.NewRecorder()
req, _ := http.NewRequest(
"POST",
"/orders",
bytes.NewReader(jsonPayload),
)
var c *gin.Context
assert.Equal(t, 201, w.Code)
err2 := c.ShouldBindJSON(&req)
if err2 == nil {
return
}
}
After running the test I get the following error:
Error: unexpected end of JSON input
{"message":"Error #01: EOF\n"}
Logging the request shows the request body is JSON as expected but not sure why I am getting this error.
If you are already in the order package, you don't need to qualify names with it; you can directly access anything defined in the order package.
The struct names order.spring_orders_request and order.winter_orders_request are incorrect here; they should be order.SpringOrdersRequest and order.WinterOrdersRequest:
spring_orders_request = append(spring_orders_request, order.spring_orders_request{
Fruit: "Watermelon",
Vegetable: "Carrot",
Cupcake: "minions",
})
winter_orders_request = append(winter_orders_request, order.winter_orders_request{
Fruit: "Watermelon",
Greentea: "Japanese",
Cupcake: "pokemon",
})
The keys here are wrong: fall_orders_request, spring_orders_request and winter_orders_request should be FallOrdersRequest, SpringOrdersRequest and WinterOrdersRequest:
return order.AllOrdersRequest{
fall_orders_request: fall_orders_request,
spring_orders_request: spring_orders_request,
winter_orders_request: winter_orders_request,
}
Final:
package order
import (
"fmt"
"bytes"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
// "github.com/gin-gonic/gin"
"github.com/stretchr/testify/assert"
)
// OrderData returns the request fixture used by the tests: exactly one fall,
// one spring and one winter order.
func OrderData() AllOrdersRequest {
	return AllOrdersRequest{
		FallOrdersRequest: []FallOrdersRequest{
			{Fruit: "Watermelon", Vegetable: "Carrot"},
		},
		SpringOrdersRequest: []SpringOrdersRequest{
			{Fruit: "Watermelon", Vegetable: "Carrot", Cupcake: "minions"},
		},
		WinterOrdersRequest: []WinterOrdersRequest{
			{Fruit: "Watermelon", Greentea: "Japanese", Cupcake: "pokemon"},
		},
	}
}
// TestOrderCreation marshals the OrderData fixture, writes status 201 and the
// payload into a ResponseRecorder by hand, builds a POST request carrying the
// same payload, and asserts that the recorder reports 201.
// NOTE(review): the request is only printed, never sent to a handler, so the
// assertion checks the status written a few lines earlier in this test.
func TestOrderCreation(t *testing.T) {
params := OrderData()
jsonPayload, _ := json.Marshal(params)
// fmt.Println(jsonPayload)
_bytes := bytes.NewReader(jsonPayload)
// default `ResponseRecorder` `http.Response` status is 200
// we need to update it to 201 before we access it in `assert`
w := httptest.NewRecorder()
w.WriteHeader(201)
contentLength, err := w.Write(jsonPayload)
fmt.Println(contentLength, err)
req, _ := http.NewRequest(
"POST",
"http://localhost:8080/orders",
_bytes,
)
assert.Equal(t, 201, w.Code)
res := w.Result()
fmt.Println(req)
fmt.Println(res)
// Not sure what you are trying to do here but since there is nothing
// in the context and req variable is already defined of `http.Request` type
// below statements doesn't make sense.
// var c *gin.Context
// if err := c.ShouldBindJSON(&req); err != nil {
// return
// }
}

go-colly: How can I get HTML title in c.OnResponse, so I can fill the struct?

How can I get the HTML title in c.OnResponse? Or is there a better alternative for filling the struct with url/title/content?
At the end I need to fill the below struct and post it to elasticsearch.
// WebPage is the document built for each visited page; its fields are encoded
// to JSON under the keys "url", "title" and "content".
type WebPage struct {
Url string `json:"url"`
Title string `json:"title"`
Content string `json:"content"`
}
// Print the response
c.OnResponse(func(r *colly.Response) {
pageCount++
log.Println(r.Headers)
webpage := WebPage{
Url: r.Ctx.Get("url"), //- can be put in ctx c.OnRequest, and r.Ctx.Get("url")
Title: "my title", //string(r.title), // Where to get this?
Content: string(r.Body), //string(r.Body) - can be done in c.OnResponse
}
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
enc.Encode(webpage) // SEND it to elasticsearch
log.Println(fmt.Sprintf("%d DONE Visiting : %s", pageCount, urlVisited))
})
I can get the title in a callback like the one below; however, Ctx is not available there, so I can't put the "title" value into Ctx. Are there other options?
c.OnHTML("title", func(e *colly.HTMLElement) {
fmt.Println(e.Text)
e.Ctx.Put("title", e.Text) // NOT ACCESSIBLE!
})
Logs
2020/05/07 17:42:37 7 DONE Visiting : https://www.coursera.org/learn/build-portfolio-website-html-css
{
"url": "https://www.coursera.org/learn/build-portfolio-website-html-css",
"title": "my page title",
"content": "page html body bla "
}
2020/05/07 17:42:37 8 DONE Visiting : https://www.coursera.org/learn/build-portfolio-website-html-css
{
"url": "https://www.coursera.org/browse/social-sciences",
"title": "my page title",
"content": "page html body bla "
}
I created a global variable of that struct type and kept filling it in from the different callbacks.
I am not sure if this is the best way.
fun main(){
....
webpage := WebPage{} //Is this a right way to declare a mutable struct?
c.OnRequest(func(r *colly.Request) { // url
webpage.Url = r.URL.String() // Is this the right way to mutate?
})
c.OnResponse(func(r *colly.Response) { //get body
pageCount++
log.Println(fmt.Sprintf("%d DONE Visiting : %s", pageCount, webpage.Url))
})
c.OnHTML("head title", func(e *colly.HTMLElement) { // Title
webpage.Title = e.Text
})
c.OnHTML("html body", func(e *colly.HTMLElement) { // Body / content
webpage.Content = e.Text // Can url title body be misrepresented in multithread scenario?
})
c.OnHTML("a[href]", func(e *colly.HTMLElement) { // href , callback
link := e.Attr("href")
e.Request.Visit(link)
})
c.OnError(func(r *colly.Response, err error) { // Set error handler
log.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
})
c.OnScraped(func(r *colly.Response) { // DONE
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
enc.Encode(webpage)
})
I based my work on Espresso's answer.
I select the whole html element in one callback and then query the head and body inside it, so everything is nicely encapsulated in a single c.OnHTML:
c2.OnHTML("html", func(html *colly.HTMLElement) {
slug := strings.Split(html.Request.URL.String(), "/")[4]
title := ""
descr := ""
h1 := ""
html.ForEach("head", func(_ int, head *colly.HTMLElement) {
title += head.ChildText("title")
head.ForEach("meta", func(_ int, meta *colly.HTMLElement) {
if meta.Attr("name") == "description" {
descr += meta.Attr("content")
}
})
})
html.ForEach("h1", func(_ int, h1El *colly.HTMLElement) {
h1 += h1El.Text
})
//Now you can do stuff with your elements from head and body
})

How to write a benchmark test in Go for a command-line tool?

When I say cmd tool, I mean a tool program like:
var (
m = flag.String("m", "GET", "")
headers = flag.String("h", "", "")
body = flag.String("d", "", "")
contentType = flag.String("T", "text/html", "")
......
)
func main() {
// Using args above, create complex logics
}
I cannot just call main() in my Benchmark().
Is there any way to do this?
Or must I rewrite a function with args for Benchmark() to call?
It's a good practice to keep your main() as short as possible and delegate all the actual work to a function/method, which makes it testable and benchmark'able if you wish. It may look something along the lines of
// main.go
// go run main.go -h "Content-Type: application/json" -h "X-Request-ID: 12345" -d "Hello"
package main
import (
	"flag"
	"fmt"
	"io"
	"log"
	"sort"
	"strings"
)
// headers is a set of HTTP header name/value pairs. It implements flag.Value
// so that a flag bound to it can be given several times on the command line,
// each occurrence adding one header. The map must be initialised (for example
// with make) before Set is called.
type headers map[string]string

// String renders the headers as comma-separated "Name: value" pairs. The
// pairs are ordered by header name so the output is the same on every call;
// ranging over the map directly would yield a random order.
func (h headers) String() string {
	names := make([]string, 0, len(h))
	for name := range h {
		names = append(names, name)
	}
	sort.Strings(names)

	pairs := make([]string, 0, len(names))
	for _, name := range names {
		pairs = append(pairs, fmt.Sprintf("%s: %s", name, h[name]))
	}
	return strings.Join(pairs, ",")
}

// Set parses value as "Name: value" and stores the pair, replacing any earlier
// value for the same name. Whitespace around both the name and the value is
// dropped. It returns an error when value has no colon or the name is blank.
func (h headers) Set(value string) error {
	i := strings.Index(value, ":")
	if i < 1 {
		return fmt.Errorf("invalid header '%s'", value)
	}
	name := strings.TrimSpace(value[:i])
	if name == "" {
		// A name made only of whitespace is as invalid as a missing one.
		return fmt.Errorf("invalid header '%s'", value)
	}
	h[name] = strings.TrimSpace(value[i+1:])
	return nil
}
// Command-line options. They are registered in init and filled in by
// flag.Parse in main.
var (
	method string
	hdrs   = make(headers)
	body   string
)

// init only registers the flags. Parsing is left to main: when this package is
// built as a test binary, init runs before the testing package has registered
// its own -test.* flags, so calling flag.Parse here would reject them and the
// benchmark could not start.
func init() {
	flag.StringVar(&method, "m", "GET", "Method")
	flag.Var(hdrs, "h", "Headers")
	flag.StringVar(&body, "d", "", "Body")
}

// main parses the command line and hands the options to run, exiting with a
// logged error when run fails.
func main() {
	flag.Parse()
	if err := run(method, hdrs, strings.NewReader(body)); err != nil {
		log.Fatal(err)
	}
}
// run is where the tool's actual work belongs. It receives the parsed options
// as plain arguments so that main, tests and benchmarks can all call it. The
// body shown here is a placeholder that always returns nil.
func run(method string, headers headers, body io.Reader) error {
// implement your logic here
return nil
}
and your benchmark
// main_test.go
package main
import (
"strings"
"testing"
)
// BenchmarkRun1 measures run with a GET request, two JSON headers and a small
// body. It fails as soon as run returns an error, so a broken run is not
// benchmarked silently.
func BenchmarkRun1(b *testing.B) {
	hdrs := headers{"Content-Type": "application/json", "Accept": "application/json"}
	body := "Hello World"
	b.ResetTimer() // keep the setup above out of the measurement
	for i := 0; i < b.N; i++ {
		// A fresh reader per iteration: run may consume the body.
		if err := run("GET", hdrs, strings.NewReader(body)); err != nil {
			b.Fatal(err)
		}
	}
}

Resources