How to read binary file in distributed manner using Erlang? - parallel-processing

I have few binary files of moderate size (1 to 5 GB) which I want to read and process using Erlang.
Each file has records of varying sizes, i.e. one record will be 200 kb but other record may be 800 kb size. Record size can be obtained by reading first few bytes of record. As it is a binary file, there's no delimiter between two records.
To process these files we can write multithreaded program but just for having fun I thought of processing files in parallel using Erlang.
I am newbie to Erlang so I don't know how I can split files into chunks and pass these chunks to Erlang processes.
Can anyone please give some ideas?

I've done this many times in other programming languages, and it was a fun learning exercise in Erlang.
The basic strategy is that you tell each process its starting byte offset, record size, and number of records to read. (If you have different-sized records in one file, you might pass a list of record sizes, like [200, 800, 800, 800, 800, 200, 100]). The workers process their chunk independently and return the result to the parent.
I think it's easy enough that you can figure it out yourself. You'll want to look at erlang:spawn, file:open, file:read, and file:position as the main pieces. But if you want the spoilers, here's my implementation of a module that reads numbers from a binary file and finds the average using multiple processes. (I'm not an Erlang expert, so there may be better ways of doing this.)
-module(average).
-export([write_file/3, read_file/4]).
-export([read_file_worker/5]).
write_file(Filename, BlockSize, ListOfNumbers) ->
BS = BlockSize*8,
BinData = [<<X:BS>> || X <- ListOfNumbers],
{ok, IoDevice} = file:open(Filename, [write, raw, binary]),
file:write(IoDevice, BinData),
file:close(IoDevice).
read_file(Filename, BlocksPerProcess, BlockSize, TotalBlockCount) ->
{ok, SpawnCount} = read_file_spawner(Filename, BlocksPerProcess, BlockSize, TotalBlockCount, 0, 0),
{ok, Sum} = read_file_listener(SpawnCount, 0),
io:format("Total sum: ~p~nNumbers seen: ~p~nAverage: ~p~n", [Sum, TotalBlockCount, Sum/TotalBlockCount]).
read_file_spawner(Filename, BlocksPerProcess, BlockSize, TotalBlockCount, BlockOffset, SpawnCount) when BlockOffset < TotalBlockCount ->
Offset = BlockOffset * BlockSize,
MaxBlocks = min(BlocksPerProcess, TotalBlockCount - BlockOffset),
spawn(?MODULE, read_file_worker, [self(), Filename, Offset, BlockSize, MaxBlocks]),
read_file_spawner(Filename, BlocksPerProcess, BlockSize, TotalBlockCount, BlockOffset + BlocksPerProcess, SpawnCount + 1);
read_file_spawner(_Filename, _BlocksPerProcess, _BlockSize, _TotalBlockCount, _BlockOffset, SpawnCount) ->
{ok, SpawnCount}.
read_file_listener(0, Accum) ->
{ok, Accum};
read_file_listener(SpawnCount, Accum) ->
receive
{ok, Number} ->
io:format("Got ~p~n", [Number]),
read_file_listener(SpawnCount - 1, Accum + Number)
end.
read_file_worker(MasterPid, Filename, Offset, BlockSize, MaxBlocks) ->
{ok, IoDevice} = file:open(Filename, [read, raw, binary]),
{ok, Offset} = file:position(IoDevice, {bof, Offset}),
{ok, Sum} = read_file_worker_loop(IoDevice, BlockSize, 0, MaxBlocks, 0),
MasterPid ! {ok, Sum}.
read_file_worker_loop(IoDevice, BlockSize, BlocksRead, MaxBlocks, Accum) when BlocksRead < MaxBlocks ->
{ok, BinData} = file:read(IoDevice, BlockSize),
Number = binary:decode_unsigned(BinData),
read_file_worker_loop(IoDevice, BlockSize, BlocksRead + 1, MaxBlocks, Accum + Number);
read_file_worker_loop(_IoDevice, _BlockSize, _BlocksRead, _MaxBlocks, Accum) ->
{ok, Accum}.
And here it is in action, reading 10 numbers from a file using 5 processes:
12> Numbers = [1,1,2,4,6,10,10,1,0,500].
[1,1,2,4,6,10,10,1,0,500]
13> average:write_file("/tmp/test.binary", 32, Numbers).
ok
14> average:read_file("/tmp/file.binary", 2, 32, length(Numbers)).
Got 500
Got 11
Got 6
Got 16
Got 2
Total sum: 535
Numbers seen: 10
Average: 53.5
ok

Why not launching a process for each file? I can't see the value of splitting small files into chunks. Your files are already small. This can be parallelized on many machines also.

Related

finding primes very slow in F#

I have answered Project Euler Question 7 very easily using Sieve of Eratosthenes in C and I had no problem with it.
I am still quite new to F# so I tried implementing the same technique
let prime_at pos =
let rec loop f l =
match f with
| x::xs -> loop xs (l |> List.filter(fun i -> i % x <> 0 || i = x))
| _ -> l
List.nth (loop [2..pos] [2..pos*pos]) (pos-1)
which works well when pos < 1000, but will crash at 10000 with out of memory exception
I then tried changing the algorithm to
let isPrime n = n > 1 && seq { for f in [2..n/2] do yield f } |> Seq.forall(fun i -> n % i <> 0)
seq {for i in 2..(10000 * 10000) do if isPrime i then yield i} |> Seq.nth 10000 |> Dump
which runs successfully but still takes a few minutes.
If I understand correctly the first algorithm is tail optimized so why does it crash? And how can I write an algorithm that runs under 1 minute (I have a fast computer)?
Looking at your first attempt
let prime_at pos =
let rec loop f l =
match f with
| x::xs -> loop xs (l |> List.filter(fun i -> i % x <> 0 || i = x))
| _ -> l
List.nth (loop [2..pos] [2..pos*pos]) (pos-1)
At each loop iteration, you are iterating over and creating a new list. This is very slow as list creation is very slow and you don't see any benefits from the cache. Several obvious optimisations such as the factor list skipping the even numbers, are skipped. When pos=10 000 you are trying to create a list which will occupy 10 000 * 10 000 * 4 = 400MB of just integers and a further 800MB of pointers (F# lists are linked lists). Futhermore, as each list element takes up a very small amount of memory there will probably be significant overhead for things like GC overhead. In the function you then create a new list of smiliar size. As a result, I am not surprised that this causes OutOfMemoryException.
Looking at the second example,
let isPrime n =
n > 1 &&
seq { for f in [2..n/2] do yield f }
|> Seq.forall(fun i -> n % i <> 0)
Here, the problem is pretty similar as you are generating giant lists for each element you are testing.
I have written a quite fast F# sieve here https://stackoverflow.com/a/12014908/124259 which shows how to do this faster.
As already mentioned by John, your implementation is slow because it generates some temporary data structures.
In the first case, you are building a list, which needs to be fully created in memory and that introduces significant overhead.
In the second case, you are building a lazy sequence, which does not consume memory (because it is build while it is being iterated), but it still introduces indirection that slows the algorithm down.
In most cases in F#, people tend to prefer readability and so using sequences is a nice way to write the code, but here you probably care more about performance, so I'd avoid sequences. If you want to keep the same structure of your code, you can rewrite isPrime like this:
let isPrime n =
let rec nonDivisible by =
if by = 1 then true // Return 'true' if we reached the end
elif n%by = 0 then false // Return 'false' if there is a divisor
else nonDivisible (by - 1) // Otherwise continue looping
n > 1 && nonDivisible (n/2)
This just replaces the sequence and forall with a recursive function nonDivisible that returns true when the number n is not divisible by any number between 2 and n/2. The function first checks the two termination cases and otherwise performs a recursive call..
With the original implementation, I'm able to find 1000th prime in 1.5sec and with the new one, it takes 22ms. Finding 10000th prime with the new implementation takes 3.2sec on my machine.

What's the ideal implementation for the Sieve of Eratosthenes between Lists, Arrays, and Mutable Arrays?

In Haskell, I've found three simple implementations of the Sieve of Eratosthenes on the Rosetta Code page.
Now my question is, which one should be used in which situations?
Correcting my initial reasoning would be helpful too:
I'm assuming the List one is the most idiomatic and easy to read for a Haskeller. Is it correct, though? I'm wondering if it suffers from the same problems as another list-based sieve that I then learned was not actually implementing the algorithm:
(edit: shown here is the list-based sieve I know has problems, not the one from RosettaCode, which I pasted at the bottom)
primes = sieve [2..] where
sieve (p:x) = p : sieve [ n | n <- x, n `mod` p > 0 ]
In terms of performance, the immutable Array seems to be the winner. With an upper bound m of 2000000, the times were about:
1.3s for List
0.6s for Array
1.8s for Mutable Array
So I'd pick Array for performance.
And of course, the Mutable Array one is also easy to reason about since I have a more imperative language background. I'm not sure why I would pick this one if I'm coding in Haskell, though, since it's both slower than the others and non-idiomatic.
Code copied here for reference:
List:
primesTo m = 2 : eratos [3,5..m] where
eratos (p : xs) | p*p>m = p : xs
| True = p : eratos (xs `minus` [p*p, p*p+2*p..])
minus a#(x:xs) b#(y:ys) = case compare x y of
LT -> x : minus xs b
EQ -> minus xs ys
GT -> minus a ys
minus a b = a
Immutable Array:
import Data.Array.Unboxed
primesToA m = sieve 3 (array (3,m) [(i,odd i) | i<-[3..m]] :: UArray Int Bool)
where
sieve p a
| p*p > m = 2 : [i | (i,True) <- assocs a]
| a!p = sieve (p+2) $ a//[(i,False) | i <- [p*p, p*p+2*p..m]]
| otherwise = sieve (p+2) a
Mutable Array:
import Control.Monad (forM_, when)
import Control.Monad.ST
import Data.Array.ST
import Data.Array.Unboxed
primeSieve :: Integer -> UArray Integer Bool
primeSieve top = runSTUArray $ do
a <- newArray (2,top) True -- :: ST s (STUArray s Integer Bool)
let r = ceiling . sqrt $ fromInteger top
forM_ [2..r] $ \i -> do
ai <- readArray a i
when ai $ do
forM_ [i*i,i*i+i..top] $ \j -> do
writeArray a j False
return a
-- Return primes from sieve as list:
primesTo :: Integer -> [Integer]
primesTo top = [p | (p,True) <- assocs $ primeSieve top]
EDIT
I showed Turner's Sieve at the top but that's not the list-based example I'm comparing with the other two. I just wanted to know if the list-based example suffers from the same "not the correct Sieve of Eratosthenes" problems as Turner's.
It appears the performance comparison is unfair because the Mutable Array example goes through evens as well and uses Integer rather than Int...
This
primes = sieve [2..] where
sieve (p:x) = p : sieve [ n | n <- x, n `mod` p > 0 ]
is not a sieve. It's very inefficient trial division. Don't use that!
I'm curious about how you got your times, there is no way that the Turner "sieve" could produce the primes not exceeding 2,000,000 in mere seconds. Letting it find the primes to 200,000 took
MUT time 6.38s ( 6.39s elapsed)
GC time 9.19s ( 9.20s elapsed)
EXIT time 0.00s ( 0.00s elapsed)
Total time 15.57s ( 15.59s elapsed)
on my box (64-bit Linux, ghc-7.6.1, compiled with -O2). The complexity of that algorithm is O(N² / log² N), almost quadratic. Letting it proceed to 2,000,000 would take about twenty minutes.
Your times for the array versions are suspicious too, though in the other direction. Did you measure interpreted code?
Sieving to 2,000,000, compiled with optimisations, the mutable array code took 0.35 seconds to run, and the immutable array code 0.12 seconds.
Now, that still has the mutable array about three times slower than the immutable array.
But, it's an unfair comparison. For the immutable array, you used Ints, and for the mutable array Integers. Changing the mutable array code to use Ints - as it should, since under the hood, arrays are Int-indexed, so using Integer is an unnecessary performance sacrifice that buys nothing - made the mutable array code run in 0.15 seconds. Close to the mutable array code, but not quite there. However, you let the mutable array do more work, since in the immutable array code you only eliminate odd multiples of the odd primes, but in the mutable array code, you mark all multiples of all primes. Changing the mutable array code to treat 2 specially, and only eliminate odd multiples of odd primes brings that down to 0.12 seconds too.
But, you're using range-checked array indexing, which is slow, and, since the validity of the indices is checked in the code itself, unnecessary. Changing that to using unsafeRead and unsafeWrite brings down the time for the immutable array to 0.09 seconds.
Then you have the problem that using
forM_ [x, y .. z]
uses boxed Ints (fortunately, GHC eliminates the list). Writing a loop yourself, so that only unboxed Int#s are used, the time goes down to 0.02 seconds.
{-# LANGUAGE MonoLocalBinds #-}
import Control.Monad (forM_, when)
import Control.Monad.ST
import Data.Array.ST
import Data.Array.Unboxed
import Data.Array.Base
primeSieve :: Int -> UArray Int Bool
primeSieve top = runSTUArray $ do
a <- newArray (0,top) True
unsafeWrite a 0 False
unsafeWrite a 1 False
let r = ceiling . sqrt $ fromIntegral top
mark step idx
| top < idx = return ()
| otherwise = do
unsafeWrite a idx False
mark step (idx+step)
sift p
| r < p = return a
| otherwise = do
prim <- unsafeRead a p
when prim $ mark (2*p) (p*p)
sift (p+2)
mark 2 4
sift 3
-- Return primes from sieve as list:
primesTo :: Int -> [Int]
primesTo top = [p | (p,True) <- assocs $ primeSieve top]
main :: IO ()
main = print .last $ primesTo 2000000
So, wrapping up, for a Sieve of Eratosthenes, you should use an array - not surprising, its efficiency depends on being able to step from one multiple to the next in short constant time.
You get very simple and straightforward code with immutable arrays, and that code performs decently for not too high limits (it gets relatively worse for higher limits, since the arrays are still copied and garbage-collected, but that's not too bad).
When you need better performance, you need mutable arrays. Writing efficient mutable array code is not entirely trivial, one has to know how the compiler translates the various constructs to choose the right one, and some would consider such code unidiomatic. But you can also use a library (disclaimer: I wrote it) that provides a fairly efficient implementation rather than writing it yourself.
Mutable array will always be the winner in terms of performance (and you really should've copied the version that works on odds only as a minimum; it should be the fastest of the three - also because it uses Int and not Integer).
For lists, tree-shaped merging incremental sieve should perform better than the one you show. You can always use it with takeWhile (< limit) if needed. I contend that it conveys the true nature of the sieve most clearly:
import Data.List (unfoldr)
primes :: [Int]
primes = 2 : _Y ((3 :) . gaps 5 . _U . map (\p -> [p*p, p*p+2*p..]))
_Y g = g (_Y g) -- recursion
_U ((x:xs):t) = (x :) . union xs . _U -- ~= nub . sort . concat
. unfoldr (\(a:b:c) -> Just (union a b, c)) $ t
gaps k s#(x:xs) | k < x = k : gaps (k+2) s -- ~= [k,k+2..]\\s, when
| otherwise = gaps (k+2) xs -- k<=x && null(s\\[k,k+2..])
union a#(x:xs) b#(y:ys) = case compare x y of -- ~= nub . sort .: (++)
LT -> x : union xs b
EQ -> x : union xs ys
GT -> y : union a ys
_U reimplements Data.List.Ordered.unionAll, and gaps 5 is (minus [5,7..]), fused for efficiency, with minus and union from the same package.
Of course nothing beats the brevity of Data.List.nubBy (((>1).).gcd) [2..] (but it's very slow).
To your 1st new question: not. It does find the multiples by counting up, as any true sieve should (although "minus" on lists is of course under-performant; the above improves on that by re-arranging a linear subtraction chain ((((xs-a)-b)-c)- ... ) into a subtraction of tree-folded additions, xs-(a+((b+c)+...))).
As has been said, using mutable arrays has the best performance. The following code is derived from this "TemplateHaskell" version converted back to something more in line with the straight mutable Array solution as the "TemplateHaskell" doesn't seem to make any difference, with some further optimizations. I believe it is faster than the usual mutable unboxed array versions due to the further optimizations and especially due to the use of the "unsafeRead" and "unsafeWrite" functions that avoid range checking of the arrays, probably also internally using pointers for array access:
{-# OPTIONS -O2 -optc-O3 #-}
import Control.Monad
import Control.Monad.ST
import Data.Array.ST
import Data.Array.Unboxed
import Data.Array.Base
primesToUA :: Word32-> [Word32]
primesToUA top = do
let sieveUA top = runSTUArray $ do
let m = ((fromIntegral top) - 3) `div` 2 :: Int
buf <- newArray (0,m) True -- :: ST s (STUArray s Int Bool)
let cullUA i = do
let p = i + i + 3
strt = p * (i + 1) + i
let cull j = do
if j > m then cullUA (i + 1)
else do
unsafeWrite buf j False
cull (j + p)
if strt > m then return ()
else do
e <- unsafeRead buf i
if e then cull strt else cullUA (i + 1)
cullUA 0; return buf
if top > 1 then 2 : [2 * (fromIntegral i) + 3 | (i,True) <- assocs $ sieveUA top]
else []
main = do
x <- read `fmap` getLine -- 1mln 2mln 10mln 100mln
print (last (primesToUA x)) -- 0.01 0.02 0.09 1.26 seconds
EDIT: The above code has been corrected and the times below edited to reflect the correction and as well the comments comparing the paged to the non-paged version have been edited.
The times to run this to the indicated top ranges are as shown in the comment table at the bottom of the above source code as measured by ideone.com and are about exactly five times faster than the answer posted by #WillNess as also measured at ideone.com. This code takes a trivial amount of time to cull the primes to two million and only 1.26 seconds to cull to a hundred million. These times are about 2.86 times faster when run on my i7 (3.5 GHz) at 0.44 seconds to a hundred million, and takes 6.81 seconds to run to one billion. The memory use is just over six megabytes for the former and sixty megabytes for the latter, which is the memory used by the huge (bit packed) array. This array also explains the non-linear performance in that as the array size exceeds the CPU cache sizes the average memory access times get worse per composite number representation cull.
EDIT_ADD: A page segmented sieve is more efficient in that it has better memory access efficiency when the buffer size is kept smaller than the L1 or L2 CPU caches, and as well has the advantage that it is unbounded so that the upper range does not have to be specified in advance and a much smaller memory footprint being just the base primes less than the square root of the range used plus the page buffer memory. The following code has been written as a page segmented implementation and is somewhat faster than the non-paged version; it also offers the advantage that one can change the output range specification at the top of the code to 'Word64' from 'Word32' so it is then not limited to the 32-bit number range, at only a slight cost in processing time (for 32-bit compiled code) for any range that is in common. The code is as follows:
-- from http://www.haskell.org/haskellwiki/Prime_numbers#Using_ST_Array
{-# OPTIONS -O2 -optc-O3 #-}
import Data.Word
import Control.Monad
import Control.Monad.ST
import Data.Array.ST
import Data.Array.Unboxed
import Data.Array.Base
primesUA :: () -> [Word32]
primesUA () = do
let pgSZBTS = 262144 * 8
let sieveUA low bps = runSTUArray $ do
let nxt = (fromIntegral low) + (fromIntegral pgSZBTS)
buf <- newArray (0,pgSZBTS - 1) True -- :: ST s (STUArray s Int Bool)
let cullUAbase i = do
let p = i + i + 3
strt = p * (i + 1) + i
when (strt < pgSZBTS) $ do
e <- unsafeRead buf i
if e then do
let loop j = do
if j < pgSZBTS then do
unsafeWrite buf j False
loop (j + p)
else cullUAbase (i + 1)
loop strt
else cullUAbase (i + 1)
let cullUA ~(p:t) = do
let bp = (fromIntegral p)
i = (bp - 3) `div` 2
s = bp * (i + 1) + i
when (s < nxt) $ do
let strt = do
if s >= low then fromIntegral (s - low)
else do
let b = (low - s) `rem` bp
if b == 0 then 0 else fromIntegral (bp - b)
let loop j = do
if j < pgSZBTS then do
unsafeWrite buf j False
loop (j + (fromIntegral p))
else cullUA t
loop strt
if low <= 0 then cullUAbase 0 else cullUA bps
return buf
let sieveList low bps = do
[2 * ((fromIntegral i) + low) + 3 | (i,True) <- assocs $ sieveUA low bps]
let sieve low bps = do
(sieveList low bps) ++ sieve (low + (fromIntegral pgSZBTS)) bps
let primes' = ((sieveList 0 []) ++ sieve (fromIntegral pgSZBTS) primes') :: [Word32]
2 : sieve 0 primes'
main = do
x <- read `fmap` getLine -- 1mln 2mln 10mln 100mln
-- 0.02 0.03 0.13 1.13 seconds
print (length (takeWhile ((>=) (fromIntegral x)) (primesUA ())))
The above code has quite a few more lines of code than the non-paged case due to the need to cull the composite number representations from the first page array differently than succeeding pages. The code also has the fixes so that there aren't memory leaks due to the base primes list and the output list now not being the same list (thus avoiding holding onto the whole list in memory).
Note that this code takes close to linear time (over the range sieved) as the range gets larger due to the culling buffer being of a constant size less than the L2 CPU cache. Memory use is a fraction of that used by the non-paged version at just under 600 kilobytes for a hundred million and just over 600 kilobytes for one billion, which slight increase is just the extra space required for the base primes less than the square root of the range list.
On ideone.com this code produces the number of primes to a hundred million in about 1.13 seconds and about 12 seconds to one billion (32-bit setting). Probably wheel factorization and definitely multi-core processing would make it even faster on a multi-core CPU. On my i7 (3.5 GHz), it takes 0.44 seconds to sieve to a hundred million and 4.7 seconds to one billion, with the roughly linear performance with increasing range as expected. It seems that there is some sort of non-linear overhead in the version of GHC run on ideone.com that has some performance penalty for larger ranges that is not present for the i7 and that is perhaps related to different garbage collection, as the page buffers are being created new for each new page. END_EDIT_ADD
EDIT_ADD2: It seems that much of the processing time for the above page segmented code is used in (lazy) list processing, so the code is accordingly reformulated with several improvements as follows:
Implemented a prime counting function that does not use list processing and uses "popCount" table look ups to count the number of 'one' bits in a 32 bit word at a time. In this way, the time to find the results is insignificant compared to the actual sieve culling time.
Stored the base primes as a list of bit packed page segments, which is much more space efficient than storing a list of primes, and the the time to convert the page segments to primes as required is not much of a computational overhead.
Tuned the prime segment make function so that for the initial zero page segment, it uses its own bit pattern as a source page, thus making the composite number cull code shorter and simpler.
The code then becomes as follows:
{-# OPTIONS -O3 -rtsopts #-} -- -fllvm ide.com doesn't support LLVM
import Data.Word
import Data.Bits
import Control.Monad
import Control.Monad.ST
import Data.Array.ST (runSTUArray)
import Data.Array.Unboxed
import Data.Array.Base
pgSZBTS = (2^18) * 8 :: Int -- size of L2 data cache
type PrimeType = Word32
type Chunk = UArray PrimeType Bool
-- makes a new page chunk and culls it
-- if the base primes list provided is empty then
-- it uses the current array as source (for zero page base primes)
mkChnk :: Word32 -> [Chunk] -> Chunk
mkChnk low bschnks = runSTUArray $ do
let nxt = (fromIntegral low) + (fromIntegral pgSZBTS)
buf <- nxt `seq` newArray (fromIntegral low, fromIntegral nxt - 1) True
let cull ~(p:ps) =
let bp = (fromIntegral p)
i = (bp - 3) `shiftR` 1
s = bp * (i + 1) + i in
let cullp j = do
if j >= pgSZBTS then cull ps
else do
unsafeWrite buf j False
cullp (j + (fromIntegral p)) in
when (s < nxt) $ do
let strt = do
if s >= low then fromIntegral (s - low)
else do
let b = (low - s) `rem` bp
if b == 0 then 0 else fromIntegral (bp - b)
cullp strt
case bschnks of
[] -> do bsbf <- unsafeFreezeSTUArray buf
cull (listChnkPrms [bsbf])
_ -> cull $ listChnkPrms bschnks
return buf
-- creates a page chunk list starting at the lw value
chnksList :: Word32 -> [Chunk]
chnksList lw =
mkChnk lw basePrmChnks : chnksList (lw + fromIntegral pgSZBTS)
-- converts a page chunk list to a list of primes
listChnkPrms :: [Chunk] -> [PrimeType]
listChnkPrms [] = []
listChnkPrms ~(hdchnk#(UArray lw _ rng _):tlchnks) =
let nxtp i =
if i >= rng then [] else
if unsafeAt hdchnk i then
(case ((lw + fromIntegral i) `shiftL` 1) + 3 of
np -> np) : nxtp (i + 1)
else nxtp (i + 1) in
(hdchnk `seq` lw `seq` nxtp 0) ++ listChnkPrms tlchnks
-- the base page chunk list used to cull the higher order pages,
-- note that it has special treatment for the zero page.
-- It is more space efficient to store this as chunks rather than
-- as a list of primes or even a list of deltas (gaps), with the
-- extra processing to convert as needed not too much.
basePrmChnks :: [Chunk]
basePrmChnks = mkChnk 0 [] : chnksList (fromIntegral pgSZBTS)
-- the full list of primes could be accessed with the following function.
primes :: () -> [PrimeType]
primes () = 2 : (listChnkPrms $ chnksList 0)
-- a quite fast prime counting up to the given limit using
-- chunk processing to avoid innermost list processing loops.
countPrimesTo :: PrimeType -> Int
countPrimesTo limit =
let lmtb = (limit - 3) `div` 2 in
let sumChnks acc chnks#(chnk#(UArray lo hi rng _):chnks') =
let cnt :: UArray PrimeType Word32 -> Int
cnt bfw =
case if lmtb < hi then fromIntegral (lmtb - lo) else rng of
crng -> case crng `shiftR` 5 of
rngw ->
let cnt' i ac =
ac `seq` if i >= rngw then
if (i `shiftL` 5) >= rng then ac else
case (-2) `shiftL` fromIntegral (lmtb .&. 31) of
msk -> msk `seq`
case (unsafeAt bfw rngw) .&.
(complement msk) of
bts -> bts `seq` case popCount bts of
c -> c `seq` case ac + c of nac -> nac
else case ac + (popCount $ unsafeAt bfw i) of
nacc -> nacc `seq` cnt' (i + 1) (nacc)
in cnt' 0 0 in
acc `seq` case runST $ do -- make UArray _ Bool into a UArray _ Word32
stbuf <- unsafeThawSTUArray chnk
stbufw <- castSTUArray stbuf
bufw <- unsafeFreezeSTUArray stbufw
return $ cnt bufw of
c -> c `seq` case acc + c of
nacc -> nacc `seq` if hi >= lmtb then nacc
else sumChnks nacc chnks' in
if limit < 2 then 0 else if limit < 3 then 1 else
lmtb `seq` sumChnks 1 (chnksList 0)
main = do
x <- read `fmap` getLine -- 1mln 2mln 10mln 100mln 1000mln
-- 0.02 0.03 0.06 0.45 4.60 seconds
-- 7328 7328 8352 8352 9424 Kilobytes
-- this takes 14.34 seconds and 9424 Kilobytes to 3 billion on ideone.com,
-- and 9.12 seconds for 3 billion on an i7-2700K (3.5 GHz).
-- The above ratio of about 1.6 is much better than usual due to
-- the extremely low memory use of the page segmented algorithm.
-- It seems thaat the Windows Native Code Generator (NCG) from GHC
-- is particularly poor, as the Linux 32-bit version takes
-- less than two thirds of the time for exactly the same program...
print $ countPrimesTo x
-- print $ length $ takeWhile ((>=) x) $ primes () -- the slow way to do this
The times and memory requirements given in the code are as observed when run on ideone.com, with 0.02, 0.03, 0.05, 0.30, 3.0, and 9.1 seconds required to run on my i7-2700K (3.5 GHz) for one, two, ten, a hundred, a thousand (one billion), and three thousand (three billion) million range, respectively, with a pretty much constant memory footprint slowly increasing with the number of base primes less than the square root of the range as required. When compiled with the LLVM compiler back end, these times become 0.01, 0.02, 0.02, 0.12, 1.35, and 4.15 seconds, respectively, due to its more efficient use of registers and machine instructions; this last is quite close to the same speed as if compiled with a 64-bit compiler rather than the 32-bit compiler used as the efficient use of registers means that availability of extra registers doesn't make much difference.
As commented in the code, the ratio between performance on my real machine and the ideone.com servers becomes much less than for much more memory wasteful algorithms due to not being throttled by memory access bottlenecks so the limit to speed is mostly just the ratio of CPU clock speeds as well as CPU processing efficiency per clock cycle. However, as commented there, there is some strange inefficiency with the GHC Native Code Generator (NCG) when run under Windows (32-bit compiler) in that the run times are over 50% slower than if run under Linux (as the ideone.com server uses). AFAIK they both have a common code base for the same Haskell GHC version and the only divergence is in the linker used (which is also used with the LLVM backend, which is not affected) as GHC NCG does not use GCC but only the mingw32 assembler, which should also be the same.
Note that this code when compiled with the LLVM compiler back end is about the same speed as the same algorithm written for highly optimized 'C/C++' implementations indicating that Haskell really has the ability to develop very tight loop coding. It might be said that the Haskell code is quite a bit more readable and secure than equivalent 'C/C++' code once one gets used to the Haskell paradigms of monadic and non-strict code. The further refinements in execution speed for the Sieve of Eratosthenes are purely a function of the tuning of the implementations used and not the choice of language between Haskell and 'C/C++'.
Summary: Of course, this isn't yet the ultimate in speed for a Haskell version of the Sieve of Eratosthenes in that we still haven't tuned the memory access to more efficiently use the fast CPU L1 cache, nor have we significantly reduced the total number of composite culling operations necessary using extreme wheel factorization other than to eliminate the odds processing. However, this is enough to answer the question in showing that mutable arrays are the most efficient way of addressing such tight loop type of problems, with potential speed gains of about 100 times over the use of lists or immutable arrays. END_EDIT_ADD2

Reading a file backwards in Mathematica -- How?

I have a large file (written in Mathematica) that contains n "records" and each of these records is a list of fixed length m, where n > 10,000 and 500 < m < 600 (bytes). Note, my system does not have the capacity to hold all records in memory --- the reason for writing them to a file. I have an application (in Mathematica) that needs to process these records in reverse order; i.e. the last record written out is the first record to be processed. How can I read these records from the file in reverse order?
Meanwhile (after some trial and error with Mathematica I/O) I found one solution. Note, this is a stripped down example of a possible solution.
fname = "testfile";
strm = OpenWrite[fname];
n = 10; (* In general, n could be very large *)
For[k = 1, k <= n, k++,
(* Create list on each pass through this loop ... *)
POt = {{k, k + 1}, {k + 2, k + 3}};
Print[POt];
(* Save to a file *)
Write[strm, POt];
];
Close[strm];
(* 2nd pass to get byte offsets of each record written to file *)
strm = OpenRead[fname];
ByteIndx = {0};
For[i = 1, i <= n, i++,
PIn = Read[strm];
AppendTo[ByteIndx, StreamPosition[strm]];
];
Drop[ByteIndx, -1]
(* Read records in reverse order *)
For[i = n, i >= 1, i--,
SetStreamPosition[strm, ByteIndx[[i]]];
PIn = Read[strm];
Print[PIn];
(* Process PIn ... *)
];
Close[strm];
It would be nice if the 2nd pass (to get the byte offsets) could be eliminated but I have not found how to do this yet... Also, these byte offsets could be written to a file (similar to how the records are handled) and then read back in one at a time, should there still be a memory problem.
for sake of putting an answer, your second pass can be written concisely:
strm = OpenRead[fname];
ByteIndx=Reap[While[Sow[StreamPosition[strm]]; !TrueQ[Read[strm ] == EndOfFile]]][[2,1,;;-2]]
n=Length[ByteIndx]

Mathematica fast 2D binning algorithm

I am having some trouble developing a suitably fast binning algorithm in Mathematica. I have a large (~100k elements) data set of the form
T={{x1,y1,z1},{x2,y2,z2},....}
and I want to bin it into a 2D array of around 100x100 bins, with the bin value being given by the sum of the Z values that fall into each bin.
Currently I am iterating through each element of the table, using Select to pick out which bin it is supposed to be in based on lists of bin boundaries, and adding the z value to a list of values occupying that bin. At the end I map Total onto the list of bins, summing their contents (I do this because I sometimes want to do other things, like maximize).
I have tried using Gather and other such functions to do this but the above method was ridiculously faster, though perhaps I am using Gather poorly. Anyway It still takes a few minutes to do the sorting by my method and I feel like Mathematica can do better. Does anyone have a nice efficient algorithm handy?
Here is a method based on Szabolcs's post that is about about an order of magnitude faster.
data = RandomReal[5, {500000, 3}];
(*500k values*)
zvalues = data[[All, 3]];
epsilon = 1*^-10;(*prevent 101 index*)
(*rescale and round (x,y) coordinates to index pairs in the 1..100 range*)
indexes = 1 + Floor[(1 - epsilon) 100 Rescale[data[[All, {1, 2}]]]];
res2 = Module[{gb = GatherBy[Transpose[{indexes, zvalues}], First]},
SparseArray[
gb[[All, 1, 1]] ->
Total[gb[[All, All, 2]], {2}]]]; // AbsoluteTiming
Gives about {2.012217, Null}
AbsoluteTiming[
System`SetSystemOptions[
"SparseArrayOptions" -> {"TreatRepeatedEntries" -> 1}];
res3 = SparseArray[indexes -> zvalues];
System`SetSystemOptions[
"SparseArrayOptions" -> {"TreatRepeatedEntries" -> 0}];
]
Gives about {0.195228, Null}
res3 == res2
True
"TreatRepeatedEntries" -> 1 adds duplicate positions up.
I intend to do a rewrite of the code below because of Szabolcs' readability concerns. Until then, know that if your bins are regular, and you can use Round, Floor, or Ceiling (with a second argument) in place of Nearest, the code below will be much faster. On my system, it tests faster than the GatherBy solution also posted.
Assuming I understand your requirements, I propose:
data = RandomReal[100, {75, 3}];
bins = {0, 20, 40, 60, 80, 100};
Reap[
Sow[{#3, #2}, bins ~Nearest~ #] & ### data,
bins,
Reap[Sow[#, bins ~Nearest~ #2] & ### #2, bins, Tr##2 &][[2]] &
][[2]] ~Flatten~ 1 ~Total~ {3} // MatrixForm
Refactored:
f[bins_] := Reap[Sow[{##2}, bins ~Nearest~ #]& ### #, bins, #2][[2]] &
bin2D[data_, X_, Y_] := f[X][data, f[Y][#2, #2~Total~2 &] &] ~Flatten~ 1 ~Total~ {3}
Use:
bin2D[data, xbins, ybins]
Here's my approach:
data = RandomReal[5, {500000, 3}]; (* 500k values *)
zvalues = data[[All, 3]];
epsilon = 1*^-10; (* prevent 101 index *)
(* rescale and round (x,y) coordinates to index pairs in the 1..100 range *)
indexes = 1 + Floor[(1 - epsilon) 100 Rescale[data[[All, {1, 2}]]]];
(* approach 1: create bin-matrix first, then fill up elements by adding zvalues *)
res1 = Module[
{result = ConstantArray[0, {100, 100}]},
Do[
AddTo[result[[##]], zvalues[[i]]] & ## indexes[[i]],
{i, Length[indexes]}
];
result
]; // Timing
(* approach 2: gather zvalues by indexes, add them up, convert them to a matrix *)
res2 = Module[{gb = GatherBy[Transpose[{indexes, zvalues}], First]},
SparseArray[gb[[All, 1, 1]] -> (Total /# gb[[All, All, 2]])]
]; // Timing
res1 == res2
These two approaches (res1 & res2) can handle 100k and 200k elements per second, respectively, on this machine. Is this sufficiently fast, or do you need to run this whole program in a loop?
Here's my approach using the function SelectEquivalents defined in What is in your Mathematica tool bag? which is perfect for a problem like this one.
data = RandomReal[100, {75, 3}];
bins = Range[0, 100, 20];
binMiddles = (Most#bins + Rest#bins)/2;
nearest = Nearest[binMiddles];
SelectEquivalents[
data
,
TagElement -> ({First#nearest[#[[1]]], First#nearest[#[[2]]]} &)
,
TransformElement -> (#[[3]] &)
,
TransformResults -> (Total[#2] &)
,
TagPattern -> Flatten[Outer[List, binMiddles, binMiddles], 1]
,
FinalFunction -> (Partition[Flatten[# /. {} -> 0], Length[binMiddles]] &)
]
If you would want to group according to more than two dimensions you could use in FinalFunction this function to give to the list result the desired dimension (I don't remember where I found it).
InverseFlatten[l_,dimensions_]:= Fold[Partition[#, #2] &, l, Most[Reverse[dimensions]]];

Slow tail recursion in F#

I have an F# function that returns a list of numbers starting from 0 in the pattern of skip n, choose n, skip n, choose n... up to a limit. For example, this function for input 2 will return [2, 3, 6, 7, 10, 11...].
Initially I implemented this as a non-tail-recursive function as below:
let rec indicesForStep start blockSize maxSize =
match start with
| i when i > maxSize -> []
| _ -> [for j in start .. ((min (start + blockSize) maxSize) - 1) -> j] # indicesForStep (start + 2 * blockSize) blockSize maxSize
Thinking that tail recursion is desirable, I reimplemented it using an accumulator list as follows:
let indicesForStepTail start blockSize maxSize =
let rec indicesForStepInternal istart accumList =
match istart with
| i when i > maxSize -> accumList
| _ -> indicesForStepInternal (istart + 2 * blockSize) (accumList # [for j in istart .. ((min (istart + blockSize) maxSize) - 1) -> j])
indicesForStepInternal start []
However, when I run this in fsi under Mono with the parameters 1, 1 and 20,000 (i.e. should return [1, 3, 5, 7...] up to 20,000), the tail-recursive version is significantly slower than the first version (12 seconds compared to sub-second).
Why is the tail-recursive version slower? Is it because of the list concatenation? Is it a compiler optimisation? Have I actually implemented it tail-recursively?
I also feel as if I should be using higher-order functions to do this, but I'm not sure exactly how to go about doing it.
As dave points out, the problem is that you're using the # operator to append lists. This is more significant performance issue than tail-recursion. In fact, tail-recursion doesn't really speed-up the program too much (but it makes it work on large inputs where the stack would overflow).
The reason why you'r second version is slower is that you're appending shorter list (the one generated using [...]) to a longer list (accumList). This is slower than appending longer list to a shorter list (because the operation needs to copy the first list).
You can fix it by collecting the elements in the accumulator in a reversed order and then reversing it before returning the result:
let indicesForStepTail start blockSize maxSize =
let rec indicesForStepInternal istart accumList =
match istart with
| i when i > maxSize -> accumList |> List.rev
| _ ->
let acc =
[for j in ((min (istart + blockSize) maxSize) - 1) .. -1 .. istart -> j]
# accumList
indicesForStepInternal (istart + 2 * blockSize) acc
indicesForStepInternal start []
As you can see, this has the shorter list (generated using [...]) as the first argument to # and on my machine, it has similar performance to the non-tail-recursive version. Note that the [ ... ] comprehension generates elements in the reversed order - so that they can be reversed back at the end.
You can also write the whole thing more nicely using the F# seq { .. } syntax. You can avoid using the # operator completely, because it allows you to yield individual elemetns using yield and perform tail-recursive calls using yield!:
let rec indicesForStepSeq start blockSize maxSize = seq {
match start with
| i when i > maxSize -> ()
| _ ->
for j in start .. ((min (start + blockSize) maxSize) - 1) do
yield j
yield! indicesForStepSeq (start + 2 * blockSize) blockSize maxSize }
This is how I'd write it. When calling it, you just need to add Seq.toList to evaluate the whole lazy sequence. The performance of this version is similar to the first one.
EDIT With the correction from Daniel, the Seq version is actually slightly faster!
In F# the list type is implemented as a singly linked list. Because of this you get different performance for x # y and y # x if x and y are of different length. That's why your seeing a difference in performance. (x # y) has running time of X.length.
// e.g.
let x = [1;2;3;4]
let y = [5]
If you did x # y then x (4 elements) would be copied into a new list and its internal next pointer would be set to the existing y list. If you did y # x then y (1 element) would be copied into a new list and its next pointer would be set to the existing list x.
I wouldn't use a higher order function to do this. I'd use list comprehension instead.
let indicesForStepTail start blockSize maxSize =
[
for block in start .. (blockSize * 2) .. (maxSize - 1) do
for i in block .. (block + blockSize - 1) do
yield i
]
This looks like the list append is the problem. Append is basically an O(N) operation on the size of the first argument. By accumulating on the left, this operation takes O(N^2) time.
The way this is typically done in functional code seems to be to accumulate the list in reverse order (by accumulating on the right), then at the end, return the reverse of the list.
The first version you have avoids the append problem, but as you point out, is not tail recursive.
In F#, probably the easiest way to solve this problem is with sequences. It is not very functional looking, but you can easily create an infinite sequence following your pattern, and use Seq.take to get the items you are interested in.

Resources