How to run filepath.WalkFunc as a goroutine - performance

I'm trying to parse a large image dataset. I'm using filepath.Walk and processing each file it finds. I'd like the walk function below to run as a goroutine.
package main

import (
	"fmt"
	"image/color"
	"image/png"
	"math/rand"
	"os"
)

var (
	Black = color.Gray{0}
)

// getRandFloatNumber returns a random float32 in [min, max).
func getRandFloatNumber(min, max float32) float32 {
	return min + rand.Float32()*(max-min)
}

func openImage(path string, info os.FileInfo, err error) error {
	infile, err := os.Open(path)
	if err != nil {
		return nil // skip unreadable files
	}
	defer infile.Close()
	img, err := png.Decode(infile)
	if err != nil {
		return nil // skip files that are not valid PNGs
	}
	array := make([]float32, 128*128)
	for y := 0; y < 128; y++ {
		for x := 0; x < 128; x++ {
			c := color.GrayModel.Convert(img.At(x, y)).(color.Gray)
			if c == Black {
				array[y*128+x] = getRandFloatNumber(0.7, 0.95) // row-major index, not x*y
			} else {
				array[y*128+x] = getRandFloatNumber(0.1, 0.25)
			}
		}
	}
	fmt.Println(info.Name())
	return nil
}
How can I run openImage as a goroutine?
Or how else can I optimize this code?

You can't get filepath.Walk to call your function in a goroutine, but you can simply start a goroutine in your WalkFunc.
package main

import (
	"os"
	"path/filepath"
)

func main() {
	filepath.Walk("/my/dir", func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		// Check more criteria if necessary. Also consider limiting the number
		// of concurrent goroutines.
		go openImage(path, info)
		return nil
	})
}

func openImage(path string, info os.FileInfo) {
}
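For a large dataset, starting one goroutine per file can exhaust memory or file descriptors. Here is a minimal sketch of one way to bound the fan-out, using a buffered channel as a semaphore; the directory, the limit of 8, and the openImage stub are illustrative, not from the question:

package main

import (
	"os"
	"path/filepath"
	"sync"
)

func main() {
	sem := make(chan struct{}, 8) // at most 8 images processed at once
	var wg sync.WaitGroup
	filepath.Walk("/my/dir", func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		wg.Add(1)
		sem <- struct{}{} // acquire a slot; blocks while 8 workers are busy
		go func() {
			defer wg.Done()
			defer func() { <-sem }() // release the slot
			openImage(path, info)
		}()
		return nil
	})
	wg.Wait() // wait for in-flight workers before main exits
}

func openImage(path string, info os.FileInfo) {}

The final wg.Wait() also matters on its own: without it, main can return while images are still being processed.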

Related

Why is my Go program so slow when navigating the files

Why is this program so slow? I thought the code was fairly optimized, but it takes significantly longer than the find command when used on my root filesystem.
It takes about 4 minutes, as opposed to the find command, which takes about 40 seconds.
I tried removing the sorting algorithm, but it didn't speed up the program.
package main

import (
	"fmt"
	"io"
	"io/fs"
	"log"
	"os"
	"sort"
	"sync"

	"github.com/google/fscrypt/filesystem"
	"github.com/sirupsen/logrus"
	"gopkg.in/alecthomas/kingpin.v2"
)

var (
	mountpoint = kingpin.Flag("mount", "The mount to find the largest file usages. Can be a subpath of the mount").Required().String()
	limit      = kingpin.Flag("limit", "The maximum number of files returned to the display").Default("10").Short('l').Int()
)

var device string

type fileDisplay struct {
	Size int64
	Path string
}

type bySize []fileDisplay

func (a bySize) Len() int           { return len(a) }
func (a bySize) Less(i, j int) bool { return a[i].Size < a[j].Size }
func (a bySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }

var fileChan = make(chan fileDisplay)
var files []fileDisplay

func main() {
	log.SetOutput(io.Discard)
	kingpin.Version("0.0.1")
	kingpin.Parse()
	//Define limit after parsing
	logrus.SetLevel(logrus.FatalLevel)
	if (*mountpoint)[len(*mountpoint)-1:] != "/" {
		*mountpoint = *mountpoint + "/"
	}
	fmt.Println("Finding the top", *limit, "largest files on filesystem", *mountpoint, "\n================================================")
	mount, err := filesystem.FindMount(*mountpoint)
	if err != nil {
		logrus.Fatal(err)
	}
	device = mount.Device
	entries, err := os.ReadDir(*mountpoint)
	if err != nil {
		logrus.Fatal(err)
	}
	var wg sync.WaitGroup
	getFiles(*mountpoint, entries, &wg)
	go func() {
		defer close(fileChan)
		wg.Wait()
	}()
	var last int64
	for file := range fileChan {
		if file.Size > last {
			files = append(files, file)
		} else {
			files = append([]fileDisplay{file}, files...)
		}
	}
	sort.Sort(bySize(files))
	var shortFiles []fileDisplay
	if len(files) > *limit {
		shortFiles = files[len(files)-*limit:]
	} else {
		shortFiles = files
	}
	for _, file := range shortFiles {
		fmt.Println(file.Path, file.DisplaySizeIEC())
	}
}

func getFiles(start string, entries []fs.DirEntry, wg *sync.WaitGroup) {
	for _, entry := range entries {
		wg.Add(1)
		go handleEntry(start, entry, wg)
	}
}

func handleEntry(start string, entry fs.DirEntry, wg *sync.WaitGroup) {
	defer wg.Done()
	var file fileDisplay
	mount, err := filesystem.FindMount(start + entry.Name())
	if err != nil {
		logrus.Fatalln(err, start+entry.Name())
		return
	}
	if mount.Device == device {
		if entry.Type().IsRegular() {
			fileInfo, err := os.Stat(start + entry.Name())
			if err != nil {
				logrus.Fatalln(err, start+entry.Name())
				return
			}
			file.Path = start + entry.Name()
			file.Size = fileInfo.Size()
			fileChan <- file
		} else if entry.IsDir() {
			entries, err := os.ReadDir(start + entry.Name())
			if err != nil {
				logrus.Fatalln(err, start+entry.Name())
				return
			}
			logrus.Info("Searching ", start+entry.Name())
			getFiles(start+entry.Name()+"/", entries, wg)
		}
	}
}

func (f *fileDisplay) DisplaySizeIEC() string {
	const unit = 1024
	b := f.Size
	if b < unit {
		return fmt.Sprintf("%dB", b)
	}
	div, exp := int64(unit), 0
	for n := b / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}
	return fmt.Sprintf("%.2f%ciB",
		float64(b)/float64(div), "KMGTPE"[exp])
}
Edit: I tried removing the channel and just appending to the slice. This sped it up, but it's not safe because multiple goroutines could be accessing it.
My final draft involved dropping the channel and using a sync.RWMutex to lock the list, with a custom append function that appends under the lock. This let me drop the channel and use append without risking multiple goroutines editing the same slice.
I dropped the channel because it was causing goroutines to stay open until the for loop ranging over the channel could reach their message. My channel operations were blocking, so the goroutines slowed everything down to the speed of the for loop iterating over the channel.
You can see the differences here:
package main

import (
	"fmt"
	"io"
	"io/fs"
	"log"
	"os"
	"sort"
	"sync"

	"github.com/google/fscrypt/filesystem"
	"github.com/sirupsen/logrus"
	"gopkg.in/alecthomas/kingpin.v2"
)

var (
	mountpoint = kingpin.Flag("mount", "The mount to find the largest file usages. Can be a subpath of the mount").Required().String()
	limit      = kingpin.Flag("limit", "The maximum number of files returned to the display").Default("10").Short('l').Int()
)

var device string

type fileDisplays struct {
	sync.RWMutex
	Files []fileDisplay
}

var files fileDisplays

type fileDisplay struct {
	Size int64
	Path string
}

type bySize []fileDisplay

func (a bySize) Len() int           { return len(a) }
func (a bySize) Less(i, j int) bool { return a[i].Size < a[j].Size }
func (a bySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }

func main() {
	log.SetOutput(io.Discard)
	kingpin.Version("0.0.1")
	kingpin.Parse()
	//Define limit after parsing
	logrus.SetLevel(logrus.FatalLevel)
	if (*mountpoint)[len(*mountpoint)-1:] != "/" {
		*mountpoint = *mountpoint + "/"
	}
	fmt.Println("Finding the top", *limit, "largest files on filesystem", *mountpoint, "\n================================================")
	mount, err := filesystem.FindMount(*mountpoint)
	if err != nil {
		logrus.Fatal(err)
	}
	device = mount.Device
	entries, err := os.ReadDir(*mountpoint)
	if err != nil {
		logrus.Fatal(err)
	}
	var wg sync.WaitGroup
	getFiles(*mountpoint, entries, &wg)
	wg.Wait()
	sort.Sort(bySize(files.Files))
	var shortFiles []fileDisplay
	if len(files.Files) > *limit {
		shortFiles = files.Files[len(files.Files)-*limit:]
	} else {
		shortFiles = files.Files
	}
	for _, file := range shortFiles {
		fmt.Println(file.Path, file.DisplaySizeIEC())
	}
}

func getFiles(start string, entries []fs.DirEntry, wg *sync.WaitGroup) {
	for _, entry := range entries {
		wg.Add(1)
		go handleEntry(start, entry, wg)
	}
}

func handleEntry(start string, entry fs.DirEntry, wg *sync.WaitGroup) {
	defer wg.Done()
	var file fileDisplay
	mount, err := filesystem.FindMount(start + entry.Name())
	if err != nil {
		logrus.Errorln(err, start+entry.Name())
		return
	}
	if mount.Device == device {
		if entry.Type().IsRegular() {
			fileInfo, err := os.Stat(start + entry.Name())
			if err != nil {
				logrus.Errorln(err, start+entry.Name())
				return
			}
			file.Path = start + entry.Name()
			file.Size = fileInfo.Size()
			files.Append(file)
		} else if entry.IsDir() {
			entries, err := os.ReadDir(start + entry.Name())
			if err != nil {
				logrus.Errorln(err, start+entry.Name())
				return
			}
			logrus.Info("Searching ", start+entry.Name())
			getFiles(start+entry.Name()+"/", entries, wg)
		}
	}
}

func (f *fileDisplay) DisplaySizeIEC() string {
	const unit = 1024
	b := f.Size
	if b < unit {
		return fmt.Sprintf("%dB", b)
	}
	div, exp := int64(unit), 0
	for n := b / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}
	return fmt.Sprintf("%.2f%ciB",
		float64(b)/float64(div), "KMGTPE"[exp])
}

func (fd *fileDisplays) Append(item fileDisplay) {
	fd.Lock()
	defer fd.Unlock()
	fd.Files = append(fd.Files, item)
}
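Beyond the mutex, two other costs stand out in both drafts: a goroutine is spawned for every directory entry, and each one calls filesystem.FindMount. As a hedged sketch (not the author's final code), here is the same walk with a bounded worker count, assuming golang.org/x/sync/errgroup; the mount-device check is omitted for brevity, and the root path and limit of 64 are illustrative:

package main

import (
	"fmt"
	"io/fs"
	"path/filepath"
	"sync"

	"golang.org/x/sync/errgroup"
)

type fileDisplay struct {
	Size int64
	Path string
}

func main() {
	var (
		g     errgroup.Group
		mu    sync.Mutex
		files []fileDisplay
	)
	g.SetLimit(64) // at most 64 stat workers in flight
	filepath.WalkDir("/", func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return nil // skip unreadable entries instead of aborting the walk
		}
		if d.IsDir() {
			return nil
		}
		g.Go(func() error {
			info, err := d.Info() // stats the file lazily, once
			if err != nil {
				return nil
			}
			mu.Lock()
			files = append(files, fileDisplay{Size: info.Size(), Path: path})
			mu.Unlock()
			return nil
		})
		return nil
	})
	g.Wait()
	fmt.Println(len(files), "files found")
}

Bounding the worker count keeps memory and file-descriptor use flat regardless of how many entries the filesystem has.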

How can I destruct this in Golang?

It may be a stupid question because I just learned Golang. I hope you understand.
I am making a program to extract data from the homepage using the goquery package:
package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/PuerkitoBio/goquery"
)

var url string = "https://www.jobkorea.co.kr/Search/?stext=golang&tabType=recruit&Page_No=3"

func main() {
	getPages()
}

func getPages() int {
	res, err := http.Get(url)
	checkErr(err)
	checkCode(res)
	defer res.Body.Close()
	doc, err := goquery.NewDocumentFromReader(res.Body)
	checkErr(err)
	doc.Find(".tplPagination").Each(func(i int, s *goquery.Selection) {
		fmt.Println(s.Find("a"))
	})
	return 0
}

func checkErr(err error) {
	if err != nil {
		log.Fatalln(err)
		fmt.Println(err)
	}
}

func checkCode(res *http.Response) {
	if res.StatusCode != 200 {
		log.Fatalln("Request failed with statusCode:", res.StatusCode)
	}
}
It prints the following:
&{[0x140002db0a0 0x140002db570 0x140002db810 0x140002dbd50 0x140002dc000 0x140002dc2a0 0x140002dc540 0x140002dc850] 0x140000b2438 0x14000305680}
&{[0x140002dcd90 0x140002dd810] 0x140000b2438 0x14000305710}
But I just want to print out only the first array, like this:
[0x140002dcd90 0x140002dd810]
How can I destructure them?
The problem is that you are printing each result as it is matched.
You can save the *goquery.Selection values in a new slice and print only the last element. This example works because you want the last occurrence; in real code you should parse the query result for something specific, so you don't depend on result order.
// type Selection struct {
//     Nodes    []*html.Node
//     document *Document
//     prevSel  *Selection
// }
var temp []*goquery.Selection
temp = append(temp, doc.Find(".tplPagination").Each(func(i int, s *goquery.Selection) {
	s.Find("a")
}))
fmt.Printf("last: %v\n", temp[len(temp)-1])
temp[len(temp)-1]: &{[0xc0002dcd90 0xc0002e0a80] 0xc00000e3f0 0xc000309770}
The Nodes []*html.Node field can be accessed the same way:
fmt.Printf("last: %v\n", temp[len(temp)-1].Nodes)
As per your comment, you were looking to parse the page and get the number of pages and the number of posts. Here is my attempt:
package main

import (
	"fmt"
	"log"
	"math"
	"net/http"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func errCheck(err error) {
	if err != nil {
		log.Fatal(err)
	}
}

func ExampleScrape() {
	url := "https://www.jobkorea.co.kr/Search/?stext=golang&tabType=recruit&Page_No=%d" // %d: page is an int
	page := 3
	fmt.Println("Current page:", page)
	res, err := http.Get(fmt.Sprintf(url, page))
	errCheck(err)
	defer res.Body.Close()
	if res.StatusCode != 200 {
		log.Fatalf("status code error: %d %s", res.StatusCode, res.Status)
	}
	doc, err := goquery.NewDocumentFromReader(res.Body)
	errCheck(err)
	posts_div := doc.Find(".recruit-info div.dev_list.lists-cnt")
	total_count_div := posts_div.Nodes[0]
	var total_count int
	for _, a := range total_count_div.Attr {
		if a.Key == "total-count" {
			total_count, err = strconv.Atoi(a.Val)
			errCheck(err)
			break
		}
	}
	fmt.Println("Total count:", total_count)
	titles := posts_div.Find(".list-post .title")
	fmt.Println("On this page:", len(titles.Nodes))
	fmt.Println("Pages:", math.Ceil(float64(total_count)/float64(len(titles.Nodes))))
	fmt.Println("\nTitles on this page:")
	titles.Each(func(i int, s *goquery.Selection) {
		fmt.Println("\t-", strings.TrimSpace(s.Text()))
	})
}

func main() {
	ExampleScrape()
}

Golang segment stdin stream to file

Instead of writing a pipe to one huge file, I want to segment the stream into chunks on signal USR1. I think I got the basics working, but the app just hangs and nothing happens. Any clues or best practices for handling an uncontrollable input stream with byte-perfect segmentation?
package main

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"os/signal"
	"syscall"
	"time"
)

var done bool

func handle(c chan os.Signal) {
	for {
		sig := <-c
		switch sig {
		case syscall.SIGUSR1:
			fmt.Println("###Sink temporarily_closed###")
			done = true
		case syscall.SIGUSR2:
			fmt.Println("###Sink closed###")
			done = true
		case syscall.SIGHUP:
			fmt.Println("###Sink running###")
		}
	}
}

func check(e error) {
	if e != nil {
		panic(e)
	}
}

func main() {
	c := make(chan os.Signal, 1)
	signal.Notify(c, syscall.SIGUSR1, syscall.SIGUSR2, syscall.SIGHUP)
	go handle(c)
	reader := bufio.NewReaderSize(os.Stdin, 1024*10)
	for {
		if done {
			file, err := os.Create("./temp.file")
			check(err)
			writer := bufio.NewWriter(file)
			written, err := io.Copy(writer, reader)
			check(err)
			fmt.Println(written)
			writer.Flush()
			file.Close()
			reader.Reset(os.Stdin)
			done = false
		}
		time.Sleep(time.Millisecond)
	}
}
You need to io.CopyN(dst, src, 4096) in a loop and rotate the file once in a while. See the example below. I rotate by size, but it is easy to add signal handling.
package main

import (
	"fmt"
	"io"
	"log"
	"os"
	"time"
)

var count int
var f *os.File

func rotate() *os.File {
	if f != nil {
		if err := f.Close(); err != nil {
			log.Fatal(err)
		}
	}
	fname := fmt.Sprintf("./dump-%d.bin", count)
	count++
	f, err := os.Create(fname)
	if err != nil {
		log.Fatal(err)
	}
	log.Println("rotated:", fname)
	return f
}

func main() {
	var written int64 // io.CopyN returns an int64, so count bytes in an int64
	reader := os.Stdin
	for {
		if written == 0 || written >= 4096*10 {
			f = rotate()
			written = 0
		}
		n, err := io.CopyN(f, reader, 4096)
		written += n
		if err == io.EOF {
			log.Println("EOF, written:", written)
			return
		}
		if err != nil {
			log.Fatal(err)
		}
		log.Println("written:", written)
		time.Sleep(time.Millisecond * 500)
	}
}
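Since the question rotates on SIGUSR1 rather than on size, here is a hedged sketch of the same chunked-copy loop with signal-driven rotation; checking the signal channel only between chunks keeps the segmentation byte-exact. The 4096-byte chunk size and the file names are illustrative:

package main

import (
	"fmt"
	"io"
	"log"
	"os"
	"os/signal"
	"syscall"
)

var count int

func rotate(old *os.File) *os.File {
	if old != nil {
		if err := old.Close(); err != nil {
			log.Fatal(err)
		}
	}
	fname := fmt.Sprintf("./dump-%d.bin", count)
	count++
	f, err := os.Create(fname)
	if err != nil {
		log.Fatal(err)
	}
	log.Println("rotated:", fname)
	return f
}

func main() {
	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, syscall.SIGUSR1)
	f := rotate(nil)
	for {
		// Copy one chunk at a time so a rotation never splits a chunk.
		if _, err := io.CopyN(f, os.Stdin, 4096); err != nil {
			if err == io.EOF {
				f.Close()
				return
			}
			log.Fatal(err)
		}
		// Rotate between chunks if SIGUSR1 arrived; otherwise keep copying.
		select {
		case <-sigs:
			f = rotate(f)
		default:
		}
	}
}

Unlike the unsynchronized done flag in the question, receiving from the channel inside the copy loop avoids the data race between the signal-handler goroutine and the writer.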

Replacement for *os.File of Go default log

I'm trying to write a wrapper for Go's built-in logger, keeping the *log.Logger type for compatibility.
package main

import (
	"log"
	"os"
)

var (
	mylog *log.Logger
)

func main() {
	mylog = log.New(os.Stdout, "", 0)
	mylog.Printf("test")
}
Instead of using os.Stdout, I want to create something similar to os.Stdout that prints with a prefix, like below.
package main

import (
	"log"
	"mylibrary"
)

var (
	mylog *log.Logger
)

func main() {
	mylog = log.New(mylibrary.Prefix, "", 0)
	mylog.Printf("test")
}
Basically, I still want to have a *log.Logger while using a custom writer. Can someone give me a hint how I can make this work?
Currently, I'm using the following to do that, but I bet there's a better way.
func NewIoWriter(f func(string)) *io.PipeWriter {
	r, w := io.Pipe()
	go func() {
		scanner := bufio.NewScanner(r)
		for scanner.Scan() {
			f(scanner.Text())
		}
		if err := scanner.Err(); err != nil {
			f(err.Error())
		}
		r.Close()
	}()
	runtime.SetFinalizer(w, (*io.PipeWriter).Close)
	return w
}
What would be a better way to make it work?
Thank you
How about something like this:
type myLogWriter struct {
	logFunc func(string)
	line    string // buffers an incomplete line between Write calls
}

func (w *myLogWriter) Write(b []byte) (int, error) {
	l := len(b)
	for len(b) != 0 {
		i := bytes.Index(b, []byte{'\n'})
		if i == -1 {
			// No newline yet: keep the fragment for the next Write.
			w.line += string(b)
			break
		} else {
			// Emit the completed line and continue after the newline.
			w.logFunc(w.line + string(b[:i]))
			b = b[i+1:]
			w.line = ""
		}
	}
	return l, nil
}

func NewLogWriter(f func(string)) *myLogWriter {
	return &myLogWriter{
		logFunc: f,
	}
}
See https://play.golang.org/p/L6PG1gCK1er.
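For completeness, a minimal usage sketch, assuming the NewLogWriter above is in the same package; the "[myprefix]" string is illustrative:

package main

import (
	"fmt"
	"log"
)

func main() {
	// Each completed line from the logger is handed to the callback,
	// which adds its own prefix before printing.
	mylog := log.New(NewLogWriter(func(s string) {
		fmt.Println("[myprefix]", s)
	}), "", 0)
	mylog.Printf("test") // prints: [myprefix] test
}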

inconsistent results using golang channels

I have a task written in Go to get a unique list from a bunch of text files. I put in some parallelization using channels and am having inconsistent results now - a variance of about 5 records appearing or not appearing between runs with the same input files.
I am testing it with go run process.go | wc -l on Fedora x86_64, go1.1.2, 8-core AMD.
The code is:
package main

import (
	"encoding/csv"
	"fmt"
	"io"
	"log"
	"os"
	"regexp"
)

var (
	cleanRe *regexp.Regexp = regexp.MustCompile("[^0-9]+")
	comma   rune           = '\t'
	fieldsPerRecord        = -1
)

func clean(s string) string {
	clean := cleanRe.ReplaceAllLiteralString(s, "")
	if len(clean) < 6 {
		return ""
	}
	return clean
}

func uniqueChannel(inputChan chan []string, controlChan chan string) {
	defer func() { controlChan <- "Input digester." }()
	uniq := make(map[string]map[string]bool)
	i := 0
	for record := range inputChan {
		i++
		id, v := record[0], record[1]
		if uniq[id] == nil {
			uniq[id] = make(map[string]bool)
		} else if !uniq[id][v] {
			uniq[id][v] = true
			fmt.Println(id, string(comma), v)
		}
	}
	log.Println("digest ", i)
}

func processFile(fileName string, outputChan chan []string, controlChan chan string) {
	defer func() { controlChan <- fileName }()
	f, err := os.Open(fileName)
	if err != nil {
		log.Fatal(err)
	}
	r := csv.NewReader(f)
	r.FieldsPerRecord = fieldsPerRecord
	r.Comma = comma
	// Process the records
	i := 0
	for record, err := r.Read(); err != io.EOF; record, err = r.Read() {
		if err != nil {
			continue
		}
		id := record[0]
		for _, v := range record[1:] {
			if cleanV := clean(v); cleanV != "" {
				i++
				outputChan <- []string{id, cleanV}
			}
		}
	}
	log.Println(fileName, i)
}

func main() {
	inputs := []string{}
	recordChan := make(chan []string, 100)
	processesLeft := len(inputs) + 1
	controlChan := make(chan string, processesLeft)
	// Ingest the inputs
	for _, fName := range inputs {
		go processFile(fName, recordChan, controlChan)
	}
	// This is the loop to ensure it's all unique
	go uniqueChannel(recordChan, controlChan)
	// Make sure all the channels close up
	for processesLeft > 0 {
		if processesLeft == 1 {
			close(recordChan)
		}
		c := <-controlChan
		log.Println(c)
		processesLeft--
	}
	close(controlChan)
}
It seems like it closes the channel before it's empty and quits. Without the closing mechanism I was getting deadlocks - I'm out of ideas.
You could ditch the control channel and use a sync.WaitGroup:
package main

import (
	"encoding/csv"
	"fmt"
	"io"
	"log"
	"os"
	"regexp"
	"sync"
)

var (
	cleanRe *regexp.Regexp = regexp.MustCompile("[^0-9]+")
	comma   rune           = '\t'
	fieldsPerRecord        = -1
)

func clean(s string) string {
	clean := cleanRe.ReplaceAllLiteralString(s, "")
	if len(clean) < 6 {
		return ""
	}
	return clean
}

func uniqueChannel(inputChan chan []string) {
	uniq := make(map[string]map[string]bool)
	i := 0
	for record := range inputChan {
		i++
		id, v := record[0], record[1]
		if uniq[id] == nil {
			uniq[id] = make(map[string]bool)
		} else if !uniq[id][v] {
			uniq[id][v] = true
			fmt.Println(id, string(comma), v)
		}
	}
	log.Println("digest ", i)
}

func processFile(fileName string, outputChan chan []string) {
	f, err := os.Open(fileName)
	if err != nil {
		log.Fatal(err)
	}
	r := csv.NewReader(f)
	r.FieldsPerRecord = fieldsPerRecord
	r.Comma = comma
	// Process the records
	for record, err := r.Read(); err != io.EOF; record, err = r.Read() {
		if err != nil {
			continue
		}
		id := record[0]
		for _, v := range record[1:] {
			if cleanV := clean(v); cleanV != "" {
				outputChan <- []string{id, cleanV}
			}
		}
	}
}

func main() {
	inputs := []string{"ex.tsv"}
	recordChan := make(chan []string)
	var wg sync.WaitGroup
	// Ingest the inputs
	for _, fName := range inputs {
		wg.Add(1)
		go func(fName string) { // pass fName as an argument so each goroutine gets its own copy
			defer wg.Done()
			processFile(fName, recordChan)
		}(fName)
	}
	go func() {
		wg.Wait()
		close(recordChan)
	}()
	// This is the loop to ensure it's all unique
	uniqueChannel(recordChan)
}
