How do I read in a large flat file - go

I have a flat file that has 339276 line of text in it for a size of 62.1 MB. I am attempting to read in all the lines, parse them based on some conditions I have and then insert them into a database.
I originally attempted to use a bufio.Scan() loop and bufio.Text() to get the line but I was running out of buffer space. I switched to using bufio.ReadLine/ReadString/ReadByte (I tried each) and had the same problem with each. I didn't have enough buffer space.
I tried using read and setting the buffer size but as the document says it actually a const that can be made smaller but never bigger that 64*1024 bytes. I then tried to use File.ReadAt where I set the starting postilion and moved it along as I brought in each section to no avail. I have looked at the following examples and explanations (not an exhaustive list):
Read text file into string array (and write)
How to Read last lines from a big file with Go every 10 secs
reading file line by line in go
How do I read in an entire file (either line by line or the whole thing at once) into a slice so I can then go do things to the lines?
Here is some code that I have tried:
file, err := os.Open(feedFolder + value)
handleError(err)
defer file.Close()
// fileInfo, _ := file.Stat()
var linesInFile []string
r := bufio.NewReader(file)
for {
path, err := r.ReadLine("\n") // 0x0A separator = newline
linesInFile = append(linesInFile, path)
if err == io.EOF {
fmt.Printf("End Of File: %s", err)
break
} else if err != nil {
handleError(err) // if you return error
}
}
fmt.Println("Last Line: ", linesInFile[len(linesInFile)-1])
Here is something else I tried:
var fileSize int64 = fileInfo.Size()
fmt.Printf("File Size: %d\t", fileSize)
var bufferSize int64 = 1024 * 60
bytes := make([]byte, bufferSize)
var fullFile []byte
var start int64 = 0
var interationCounter int64 = 1
var currentErr error = nil
for currentErr != io.EOF {
_, currentErr = file.ReadAt(bytes, st)
fullFile = append(fullFile, bytes...)
start = (bufferSize * interationCounter) + 1
interationCounter++
}
fmt.Printf("Err: %s\n", currentErr)
fmt.Printf("fullFile Size: %s\n", len(fullFile))
fmt.Printf("Start: %d", start)
var currentLine []string
for _, value := range fullFile {
if string(value) != "\n" {
currentLine = append(currentLine, string(value))
} else {
singleLine := strings.Join(currentLine, "")
linesInFile = append(linesInFile, singleLine)
currentLine = nil
}
}
I am at a loss. Either I don't understand exactly how the buffer works or I don't understand something else. Thanks for reading.

bufio.Scan() and bufio.Text() in a loop perfectly works for me on a files with much larger size, so I suppose you have lines exceeded buffer capacity. Then
check your line ending
and which Go version you use path, err :=r.ReadLine("\n") // 0x0A separator = newline? Looks like func (b *bufio.Reader) ReadLine() (line []byte, isPrefix bool, err error) has return value isPrefix specifically for your use case
http://golang.org/pkg/bufio/#Reader.ReadLine

It's not clear that it's necessary to read in all the lines before parsing them and inserting them into a database. Try to avoid that.
You have a small file: "a flat file that has 339276 line of text in it for a size of 62.1 MB." For example,
package main
import (
"bytes"
"fmt"
"io"
"io/ioutil"
)
func readLines(filename string) ([]string, error) {
var lines []string
file, err := ioutil.ReadFile(filename)
if err != nil {
return lines, err
}
buf := bytes.NewBuffer(file)
for {
line, err := buf.ReadString('\n')
if len(line) == 0 {
if err != nil {
if err == io.EOF {
break
}
return lines, err
}
}
lines = append(lines, line)
if err != nil && err != io.EOF {
return lines, err
}
}
return lines, nil
}
func main() {
// a flat file that has 339276 lines of text in it for a size of 62.1 MB
filename := "flat.file"
lines, err := readLines(filename)
fmt.Println(len(lines))
if err != nil {
fmt.Println(err)
return
}
}

It seems to me this variant of readLines is shorter and faster than suggested peterSO
func readLines(filename string) (map[int]string, error) {
lines := make(map[int]string)
data, err := ioutil.ReadFile(filename)
if err != nil {
return nil, err
}
for n, line := range strings.Split(string(data), "\n") {
lines[n] = line
}
return lines, nil
}

package main
import (
"fmt"
"os"
"log"
"bufio"
)
func main() {
FileName := "assets/file.txt"
file, err := os.Open(FileName)
if err != nil {
log.Fatal(err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
fmt.Println(scanner.Text())
}
}

Related

Ignore a line containing a pattern from a long text file in Go

I'm trying to implement a function to ignore a line containing a pattern from a long text file (ASCII guaranteed) in Go
The functions I have below withoutIgnore and withIgnore, both take a filename argument input and return a *byte.Buffer, which can be subsequently used to write to a io.Writer.
The withIgnore function takes an additional argument pattern to exclude the line containing the pattern from the file. The function works, but with benchmarking, found it to be 5x slower than withoutIgnore. Is there a way it could be improved?
package main
import (
"bufio"
"bytes"
"io"
"log"
"os"
)
func withoutIgnore(f string) (*bytes.Buffer, error) {
rfd, err := os.Open(f)
if err != nil {
log.Fatal(err)
}
defer func() {
if err := rfd.Close(); err != nil {
log.Fatal(err)
}
}()
inputBuffer := make([]byte, 1048576)
var bytesRead int
var bs []byte
opBuffer := bytes.NewBuffer(bs)
for {
bytesRead, err = rfd.Read(inputBuffer)
if err == io.EOF {
return opBuffer, nil
}
if err != nil {
return nil, nil
}
_, err = opBuffer.Write(inputBuffer[:bytesRead])
if err != nil {
return nil, err
}
}
return opBuffer, nil
}
func withIgnore(f, pattern string) (*bytes.Buffer, error) {
rfd, err := os.Open(f)
if err != nil {
log.Fatal(err)
}
defer func() {
if err := rfd.Close(); err != nil {
log.Fatal(err)
}
}()
scanner := bufio.NewScanner(rfd)
var bs []byte
buffer := bytes.NewBuffer(bs)
for scanner.Scan() {
if !bytes.Contains(scanner.Bytes(), []byte(pattern)) {
_, err := buffer.WriteString(scanner.Text() + "\n")
if err != nil {
return nil, nil
}
}
}
return buffer, nil
}
func main() {
// buff, err := withoutIgnore("base64dump.log")
buff, err := withIgnore("base64dump.log", "AUDIT")
if err != nil {
log.Fatal(err)
}
_, err = buff.WriteTo(os.Stdout)
if err != nil {
log.Fatal(err)
}
}
Benchmark test
package main
import "testing"
func BenchmarkTestWithoutIgnore(b *testing.B) {
for i := 0; i < b.N; i++ {
_, err := withoutIgnore("base64dump.log")
if err != nil {
b.Fatal(err)
}
}
}
func BenchmarkTestWithIgnore(b *testing.B) {
for i := 0; i < b.N; i++ {
_, err := withIgnore("base64dump.log", "AUDIT")
if err != nil {
b.Fatal(err)
}
}
}
and the "base64dump.log" can be generated in the command line using
base64 /dev/urandom | head -c 10000000 > base64dump.log
Since ASCII is guaranteed, one can work directly at byte level.
Still if one checks each byte for line breaks when reading the input and then searches for the pattern again within the line, operations are applied to each byte.
If, on the other hand, one reads chunks of the input and performs an optimized search for the pattern in the text, not even examining each input byte, one minimizes the operations per input byte.
For example, there is the Boyer-Moore string search algorithm. Go's built-in bytes.Index function is also optimized. The achieved speed depends of course on the input data and the actual pattern. For the input as specified in the question, `bytes.Index turned out to be significantly more performant when measured.
Procedure
read in a chunk, where the chunk size should be significantly longer than the maximum line length, a value >= 64KB should probably be good, in the test 1MB was used as in the question.
a chunk usually doesn't end at a linefeed, so search from the end of the chunk to the next linefeed, limit the search to this slice and remember the remaining data for the next pass
the last chunk does not necessarily end in a linefeed
with the help of the performant GO function bytes.Index you can find the places where the pattern occurs in the chunk
from the found location one searches for the preceding and the following linefeed
then the block is output up to the corresponding beginning of the line
and the search is continued from the end of the line where the pattern occurred
if the search does not find another location, the rest is output
read the next chunk and apply the described steps again until the end of the file is reached
Noteworthy
A read operation may return less data than the chunk size, so it makes sense to repeat the read operation until the chunk size data has been read.
Benchmark
Optimized code is often significantly more complicated, but the performance is also significantly better, as we will see in a moment.
BenchmarkTestWithoutIgnore-8 270 4137267 ns/op
BenchmarkTestWithIgnore-8 54 22403931 ns/op
BenchmarkTestFilter-8 150 7947454 ns/op
Here, the optimized code BenchmarkTestFilter-8 is only about 1.9x slower than the operation without filtering while the BenchmarkTestWithIgnore-8 method is 5.4x slower than the comparison value without filtering.
Looked at another way: the optimized code is 2.8 times faster than the unoptimized one.
Code
Of course, here is the code for your own tests:
func filterFile(f, pattern string) (*bytes.Buffer, error) {
rfd, err := os.Open(f)
if err != nil {
log.Fatal(err)
}
defer func() {
if err := rfd.Close(); err != nil {
log.Fatal(err)
}
}()
reader := bufio.NewReader(rfd)
return filter(reader, []byte(pattern), 1024*1024)
}
// chunkSize must be larger than the longest line
// a reasonable size is probably >= 64K
func filter(reader io.Reader, pattern []byte, chunkSize int) (*bytes.Buffer, error) {
var bs []byte
buffer := bytes.NewBuffer(bs)
chunk := make([]byte, chunkSize)
var remaining []byte
for lastChunk := false; !lastChunk; {
n, err := readChunk(reader, chunk, remaining, chunkSize)
if err != nil {
if err == io.EOF {
lastChunk = true
} else {
return nil, err
}
}
remaining = remaining[:0]
if !lastChunk {
for i := n - 1; i > 0; i-- {
if chunk[i] == '\n' {
remaining = append(remaining, chunk[i+1:n]...)
n = i + 1
break
}
}
}
s := 0
for s < n {
hit := bytes.Index(chunk[s:n], pattern)
if hit < 0 {
break
}
hit += s
startOfLine := hit
for ; startOfLine > 0; startOfLine-- {
if chunk[startOfLine] == '\n' {
startOfLine++
break
}
}
endOfLine := hit + len(pattern)
for ; endOfLine < n; endOfLine++ {
if chunk[endOfLine] == '\n' {
break
}
}
endOfLine++
_, err = buffer.Write(chunk[s:startOfLine])
if err != nil {
return nil, err
}
s = endOfLine
}
if s < n {
_, err = buffer.Write(chunk[s:n])
if err != nil {
return nil, err
}
}
}
return buffer, nil
}
func readChunk(reader io.Reader, chunk, remaining []byte, chunkSize int) (int, error) {
copy(chunk, remaining)
r := len(remaining)
for r < chunkSize {
n, err := reader.Read(chunk[r:])
r += n
if err != nil {
return r, err
}
}
return r, nil
}
And the benchmark part might look something like this:
func BenchmarkTestFilter(b *testing.B) {
for i := 0; i < b.N; i++ {
_, err := filterFile("base64dump.log", "AUDIT")
if err != nil {
b.Fatal(err)
}
}
}
The filter function was split and the actual job is done in func filter(reader io.Reader, pattern []byte, chunkSize int) (*bytes.Buffer, error).
By injecting a reader and a chunkSize, the creation of unit tests is already prepared or contemplated, which is missing here, but is definitely recommended when dealing with indexes.
However, the main point here was to find a way to significantly improve it in terms of performance.

How can I make a file reader function more efficiently?

I'm trying this code:
// GetFooter returns a string which is the Footer of an edi file
func GetFooter(file *os.File) (out string, err error) {
// TODO can scanner read files backwards? Seek can get us to the end of file
var lines []string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
lines = append(lines, scanner.Text())
}
line1 := lines[len(lines)-2]
line2 := lines[len(lines)-1]
return line1 + "\n" + line2, scanner.Err()
}
I'm wondering if there's a cheaper way to get the last two lines of a file?
You can keep only the last two lines in memory as you scan the buffer.
Try it on Go playground.
package main
import (
"fmt"
"bufio"
"bytes"
"strconv"
)
func main() {
var buffer bytes.Buffer
for i := 0; i < 1000; i++ {
s := strconv.Itoa(i)
buffer.WriteString(s + "\n")
}
fmt.Println(GetFooter(&buffer))
}
func GetFooter(file *bytes.Buffer) (out string, err error) {
var line1, line2 string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line1, line2 = line2, scanner.Text()
}
return line1 + "\n" + line2, scanner.Err()
}
If you know roughly the size of the last two lines, you could set SOME_NUMBER to be that size plus some extra bytes to make sure you always capture the last two, then do something like
file, err := os.Open(fileName)
if err != nil {
panic(err)
}
defer file.Close()
buf := make([]byte, SOME_NUMBER)
stat, err := os.Stat(fileName)
start := stat.Size() - SOME_NUMBER
_, err = file.ReadAt(buf, start)
if err != nil {
panic(err)
}
lines := strings.Split(string(start), "\n", -1)
lines = lines[len(lines)-2:]

Replacing a line within a file with Golang

I'm new to Golang, starting out with some examples. Currently, what I'm trying to do is reading a file line by line and replace it with another string in case it meets a certain condition.
The file is use for testing purposes contains four lines:
one
two
three
four
The code working on that file looks like this:
func main() {
file, err := os.OpenFile("test.txt", os.O_RDWR, 0666)
if err != nil {
panic(err)
}
reader := bufio.NewReader(file)
for {
fmt.Print("Try to read ...\n")
pos,_ := file.Seek(0, 1)
log.Printf("Position in file is: %d", pos)
bytes, _, _ := reader.ReadLine()
if (len(bytes) == 0) {
break
}
lineString := string(bytes)
if(lineString == "two") {
file.Seek(int64(-(len(lineString))), 1)
file.WriteString("This is a test.")
}
fmt.Printf(lineString + "\n")
}
file.Close()
}
As you can see in the code snippet, I want to replace the string "two" with "This is a test" as soon as this string is read from the file.
In order to get the current position within the file I use Go's Seek method.
However, what happens is that always the last line gets replaced by This is a test, making the file looking like this:
one
two
three
This is a test
Examining the output of the print statement which writes the current file position to the terminal, I get that kind of output after the first line has been read:
2016/12/28 21:10:31 Try to read ...
2016/12/28 21:10:31 Position in file is: 19
So after the first read, the position cursor already points to the end of my file, which explains why the new string gets appended to the end. Does anyone understand what is happening here or rather what is causing that behavior?
The Reader is not controller by the file.Seek. You have declared the reader as: reader := bufio.NewReader(file) and then you read one line at a time bytes, _, _ := reader.ReadLine() however the file.Seek does not change the position that the reader is reading.
Suggest you read about the ReadSeeker in the docs and switch over to using that. Also there is an example using the SectionReader.
Aside from the incorrect seek usage, the difficulty is that the line you're replacing isn't the same length as the replacement. The standard approach is to create a new (temporary) file with the modifications. Assuming that is successful, replace the original file with the new one.
package main
import (
"bufio"
"io"
"io/ioutil"
"log"
"os"
)
func main() {
// file we're modifying
name := "text.txt"
// open original file
f, err := os.Open(name)
if err != nil {
log.Fatal(err)
}
defer f.Close()
// create temp file
tmp, err := ioutil.TempFile("", "replace-*")
if err != nil {
log.Fatal(err)
}
defer tmp.Close()
// replace while copying from f to tmp
if err := replace(f, tmp); err != nil {
log.Fatal(err)
}
// make sure the tmp file was successfully written to
if err := tmp.Close(); err != nil {
log.Fatal(err)
}
// close the file we're reading from
if err := f.Close(); err != nil {
log.Fatal(err)
}
// overwrite the original file with the temp file
if err := os.Rename(tmp.Name(), name); err != nil {
log.Fatal(err)
}
}
func replace(r io.Reader, w io.Writer) error {
// use scanner to read line by line
sc := bufio.NewScanner(r)
for sc.Scan() {
line := sc.Text()
if line == "two" {
line = "This is a test."
}
if _, err := io.WriteString(w, line+"\n"); err != nil {
return err
}
}
return sc.Err()
}
For more complex replacements, I've implemented a package which can replace regular expression matches. https://github.com/icholy/replace
import (
"io"
"regexp"
"github.com/icholy/replace"
"golang.org/x/text/transform"
)
func replace2(r io.Reader, w io.Writer) error {
// compile multi-line regular expression
re := regexp.MustCompile(`(?m)^two$`)
// create replace transformer
tr := replace.RegexpString(re, "This is a test.")
// copy while transforming
_, err := io.Copy(w, transform.NewReader(r, tr))
return err
}
OS package has Expand function which I believe can be used to solve similar problem.
Explanation:
file.txt
one
two
${num}
four
main.go
package main
import (
"fmt"
"os"
)
var FILENAME = "file.txt"
func main() {
file, err := os.ReadFile(FILENAME)
if err != nil {
panic(err)
}
mapper := func(placeholderName string) string {
switch placeholderName {
case "num":
return "three"
}
return ""
}
fmt.Println(os.Expand(string(file), mapper))
}
output
one
two
three
four
Additionally, you may create a config (yml or json) and
populate that data in the map that can be used as a lookup table to store placeholders as well as their replacement strings and modify mapper part to use this table to lookup placeholders from input file.
e.g map will look like this,
table := map[string]string {
"num": "three"
}
mapper := func(placeholderName string) string {
if val, ok := table[placeholderName]; ok {
return val
}
return ""
}
References:
os.Expand documentation: https://pkg.go.dev/os#Expand
Playground

Count lines via bufio

I'm utilizing bufio to do a for loop for each line in a text file. I have no idea how to count the amount of lines though.
scanner := bufio.NewScanner(bufio.NewReader(file))
The above is what I use to scan my file.
You could do something like this:
counter := 0
for scanner.Scan() {
line := scanner.Text()
counter++
// do something with your line
}
fmt.Printf("Lines read: %d", counter)
Keep it simple and fast. No need for buffering, scanner already does that. Don't do unnecessary string conversions. For example,
package main
import (
"bufio"
"fmt"
"os"
)
func lineCount(filename string) (int64, error) {
lc := int64(0)
f, err := os.Open(filename)
if err != nil {
return 0, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
lc++
}
return lc, s.Err()
}
func main() {
filename := `testfile`
lc, err := lineCount(filename)
if err != nil {
fmt.Println(err)
return
}
fmt.Println(filename+" line count:", lc)
}
As I commented, the accepted answer fails at long lines. The default limit is bufio.MaxScanTokenSize which is 64KiB. So if your line is longer than 65536 chars, it will silently fail. You've got two options.
Call scanner.Buffer() and supply the sufficient max parameter. buffer may be small by default because Scanner is smart enough to allocate new ones. Can be a problem if you don't know the total size beforehand, like with vanilla Reader interface, and you've got huge lines - the memory consumption will grow correspondingly as Scanner records all the line.
Recreate scanner in the outer loop, this will ensure that you advance further:
var scanner *bufio.Scanner
counter := 0
for scanner == nil || scanner.Err() == bufio.ErrTooLong {
scanner = bufio.NewScanner(reader)
for scanner.Scan() {
counter++
}
}
The problem with (2) is that you keep allocating and deallocating buffers instead of reusing them. So let's fuse (1) and (2):
var scanner *bufio.Scanner
buffer := make([]byte, bufio.MaxScanTokenSize)
counter := 0
for scanner == nil || scanner.Err() == bufio.ErrTooLong {
scanner = bufio.NewScanner(reader)
scanner.Buffer(buffer, 0)
for scanner.Scan() {
counter++
}
}
Here is my approach to do the task:
inputFile, err := os.Open("input.txt")
if err != nil {
panic("Error happend during opening the file. Please check if file exists!")
os.Exit(1)
}
defer inputFile.Close()
inputReader := bufio.NewReader(inputFile)
scanner := bufio.NewScanner(inputReader)
// Count the words.
count := 0
for scanner.Scan() {
line := scanner.Text()
fmt.Printf("%v\n", line)
count++
}
if err := scanner.Err(); err != nil {
fmt.Fprintln(os.Stderr, "reading input:", err)
}
fmt.Printf("%d\n", count)

How to write to a file in golang

i am trying to write to to a file. i read the whole content of the file and now i want to change the content of the file based on some word that i have got from the file. but when i check, the content of the file, it is still the same and it has not change. this is what i used
if strings.Contains(string(read), sam) {
fmt.Println("this file contain that word")
temp := strings.ToUpper(sam)
fmt.Println(temp)
err := ioutil.WriteFile(fi.Name(), []byte(temp), 0644)
} else {
fmt.Println(" the word is not in the file")
}
Considering that your call to ioutil.WriteFile() is consistent with what is used in "Go by Example: Writing Files", this should work.
But that Go by example article check the err just after the write call.
You check the err outside the scope of your test:
if matched {
read, err := ioutil.ReadFile(path)
//fmt.Println(string(read))
fmt.Println(" This is the name of the file", fi.Name())
if strings.Contains(string(read), sam) {
fmt.Println("this file contain that word")
Value := strings.ToUpper(sam)
fmt.Println(Value)
err = ioutil.WriteFile(fi.Name(), []byte(Value), 0644)
} else {
fmt.Println(" the word is not in the file")
}
check(err) <===== too late
}
The err you are testing is the one you got when reading the file (ioutil.ReadFile), because of blocks and scope.
You need to check the error right after the Write call
err = ioutil.WriteFile(fi.Name(), []byte(Value), 0644)
check(err) <===== too late
Since WriteFile overwrite the all file, you could strings.Replace() to replace your word by its upper case equivalent:
r := string(read)
r = strings.Replace(r, sam, strings.ToUpper(sam), -1)
err := ioutil.WriteFile(fi.Name(), []byte(r), 0644)
For a replace which is case insensitive, use a regexp as in "How do I do a case insensitive regular expression in Go?".
The, use func (*Regexp) ReplaceAllString:
re := regexp.MustCompile("(?i)\\b"+sam+"\\b")
r = re.ReplaceAllString(r, strings.ToUpper(sam))
err := ioutil.WriteFile(fi.Name(), []byte(r), 0644)
Note the \b: word boundary to find the any word starting and ending with sam content (instead of finding substrings containing sam content).
If you want to replace substrings, simply drop the \b:
re := regexp.MustCompile("(?i)"+sam)
It's not clear what you want to do. My best guess is something like this:
package main
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"os"
)
func UpdateWord(filename string, data, word []byte) (int, error) {
n := 0
f, err := os.OpenFile(filename, os.O_WRONLY, 0644)
if err != nil {
return n, err
}
uWord := bytes.ToUpper(word)
if len(word) < len(uWord) {
err := errors.New("Upper case longer than lower case:" + string(word))
return n, err
}
if len(word) > len(uWord) {
uWord = append(uWord, bytes.Repeat([]byte{' '}, len(word))...)[:len(word)]
}
off := int64(0)
for {
i := bytes.Index(data[off:], word)
if i < 0 {
break
}
off += int64(i)
_, err = f.WriteAt(uWord, off)
if err != nil {
return n, err
}
n++
off += int64(len(word))
}
f.Close()
if err != nil {
return n, err
}
return n, nil
}
func main() {
// Test file
filename := `ltoucase.txt`
// Create test file
lcase := []byte(`update a bc def ghij update klmno pqrstu update vwxyz update`)
perm := os.FileMode(0644)
err := ioutil.WriteFile(filename, lcase, perm)
if err != nil {
fmt.Println(err)
return
}
// Read test file
data, err := ioutil.ReadFile(filename)
if err != nil {
fmt.Println(err)
return
}
fmt.Println(string(data))
// Update word in test file
word := []byte("update")
n, err := UpdateWord(filename, data, word)
if err != nil {
fmt.Println(n, err)
return
}
fmt.Println(filename, string(word), n)
data, err = ioutil.ReadFile(filename)
if err != nil {
fmt.Println(err)
return
}
fmt.Println(string(data))
}
Output:
update a bc def ghij update klmno pqrstu update vwxyz update
ltoucase.txt update 4
UPDATE a bc def ghij UPDATE klmno pqrstu UPDATE vwxyz UPDATE

Resources