Why does parallelizing this code yield almost no performance improvement on a six-core machine? - performance

I am learning parallel programming in Haskell using Simon Marlow's book. In the chapter about parallelizing Sudoku solvers, I decided to write my own solver using a backtracking algorithm. The problem is that there is almost no performance gain when I try to distribute 6 cases among 6 cores. When I try examples with more cases, I get more significant performance gains, yet still far from the theoretical maximum, which should be between 5 and 6. I understand that some cases may run far slower, but the ThreadScope diagram shows nothing that would explain such a small gain.
Can someone explain to me what I am doing wrong? Maybe there is something about ST threads that I am not understanding?
Here is the code:
Sudoku.hs
{-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
module Sudoku (getSudokus, solve) where
import Data.Vector(Vector, (!), generate, thaw, freeze)
import Data.List ( nub )
import qualified Data.Vector.Mutable as MV
import Text.Trifecta
import Control.Monad ( replicateM, when )
import Control.Applicative ((<|>))
import Control.Monad.ST
import Control.DeepSeq (NFData)
import GHC.Generics (Generic)
data Cell = Given Int
| Filled Int
| Empty
deriving (Generic, NFData)
newtype Sudoku = Sudoku (Vector Cell)
deriving (Generic, NFData)
instance Show Cell where
show Empty = " "
show (Filled x) = " " ++ show x ++ " "
show (Given x) = "[" ++ show x ++ "]"
instance Show Sudoku where
show (Sudoku vc) = "\n" ++
"+ - - - + - - - + - - - +" ++ "\n" ++
"|" ++ i 0 ++ i 1 ++ i 2 ++ "|" ++ i 3 ++ i 4 ++ i 5 ++ "|" ++ i 6 ++ i 7 ++ i 8 ++ "|" ++ "\n" ++
"|" ++ i 9 ++ i 10 ++ i 11 ++ "|" ++ i 12 ++ i 13 ++ i 14 ++ "|" ++ i 15 ++ i 16 ++ i 17 ++ "|" ++ "\n" ++
"|" ++ i 18 ++ i 19 ++ i 20 ++ "|" ++ i 21 ++ i 22 ++ i 23 ++ "|" ++ i 24 ++ i 25 ++ i 26 ++ "|" ++ "\n" ++
"+ - - - + - - - + - - - +" ++ "\n" ++
"|" ++ i 27 ++ i 28 ++ i 29 ++ "|" ++ i 30 ++ i 31 ++ i 32 ++ "|" ++ i 33 ++ i 34 ++ i 35 ++ "|" ++ "\n" ++
"|" ++ i 36 ++ i 37 ++ i 38 ++ "|" ++ i 39 ++ i 40 ++ i 41 ++ "|" ++ i 42 ++ i 43 ++ i 44 ++ "|" ++ "\n" ++
"|" ++ i 45 ++ i 46 ++ i 47 ++ "|" ++ i 48 ++ i 49 ++ i 50 ++ "|" ++ i 51 ++ i 52 ++ i 53 ++ "|" ++ "\n" ++
"+ - - - + - - - + - - - +" ++ "\n" ++
"|" ++ i 54 ++ i 55 ++ i 56 ++ "|" ++ i 57 ++ i 58 ++ i 59 ++ "|" ++ i 60 ++ i 61 ++ i 62 ++ "|" ++ "\n" ++
"|" ++ i 63 ++ i 64 ++ i 65 ++ "|" ++ i 66 ++ i 67 ++ i 68 ++ "|" ++ i 69 ++ i 70 ++ i 71 ++ "|" ++ "\n" ++
"|" ++ i 72 ++ i 73 ++ i 74 ++ "|" ++ i 75 ++ i 76 ++ i 77 ++ "|" ++ i 78 ++ i 79 ++ i 80 ++ "|" ++ "\n" ++
"+ - - - + - - - + - - - +" ++ "\n"
where i x = show (vc ! x)
parseSudoku :: Parser Sudoku
parseSudoku = do
lst <- replicateM 81 field
(newline *> return ()) <|> eof
return $ Sudoku $ generate 81 (lst !!)
where field = (char '.' >> return Empty) <|> (Given . read . return <$> digit)
getSudokus :: String -> Maybe [Sudoku]
getSudokus raw = case parseString (some parseSudoku) mempty raw of
Success ss -> Just ss
Failure _ -> Nothing
data Direction = Back | Forward
solve :: Sudoku -> Maybe Sudoku
solve sudoku@(Sudoku puzzle) = if isValid sudoku then
Just $ runST $ do
puzzle' <- thaw puzzle
go puzzle' 0 Forward
Sudoku <$> freeze puzzle'
else Nothing
where go _ 81 _ = return ()
go vector position direction = do
cell <- MV.read vector position
case (cell, direction) of
(Empty, Back) -> error "Calling back Empty cell, this should not ever occur"
(Empty, Forward) -> MV.write vector position (Filled 1) >> go vector position Forward
(Given _, Back) -> go vector (position-1) Back
(Given _, Forward) -> go vector (position+1) Forward
(Filled 10, Back) -> MV.write vector position Empty >> go vector (position-1) Back
(Filled 10, Forward) -> go vector position Back
(Filled x, Forward) -> do
let (r, c, s) = calculatePositions position
row <- getRowMV r vector
col <- getColumnMV c vector
sqr <- getSquareMV s vector
if isUnique row && isUnique col && isUnique sqr
then go vector (position+1) Forward
else MV.write vector position (Filled (x+1)) >> go vector position Forward
(Filled x, Back) -> MV.write vector position (Filled (x+1)) >> go vector position Forward
calculatePositions :: Int -> (Int, Int, Int)
calculatePositions i = let (row, col) = divMod i 9
sqr = (row `div` 3)*3 + (col `div` 3)
in (row, col, sqr)
isValid :: Sudoku -> Bool
isValid sudoku = go 0
where go 9 = True
go i = isUnique (getRow i sudoku) && isUnique (getColumn i sudoku) && isUnique (getSquare i sudoku) && go (i+1)
getRow :: Int -> Sudoku -> [Cell]
getRow l (Sudoku vector) = go 0
where go 9 = []
go c = vector ! (l*9 + c) : go (c+1)
getRowMV :: MV.PrimMonad m => Int -> MV.MVector (MV.PrimState m) Cell -> m [Cell]
getRowMV l mv = go 0
where go 9 = return []
go c = do
n <- MV.read mv (l*9 + c)
rl <- go (c+1)
return (n:rl)
getColumn :: Int -> Sudoku -> [Cell]
getColumn c (Sudoku vector) = go 0
where go 9 = []
go i = vector ! (c + i*9) : go (i+1)
getColumnMV :: MV.PrimMonad m => Int -> MV.MVector (MV.PrimState m) Cell -> m [Cell]
getColumnMV c mv = go 0
where go 9 = return []
go i = do
n <- MV.read mv (c + i*9)
rl <- go (i+1)
return (n:rl)
getSquare :: Int -> Sudoku -> [Cell]
getSquare q (Sudoku vector) = let (y, x) = quotRem q 3
start = x*3 + y*3*9
in [ vector ! start, vector ! (start + 1), vector ! (start + 2)
, vector ! (start + 9), vector ! (start + 10), vector ! (start + 11)
, vector ! (start + 18), vector ! (start + 19), vector ! (start + 20)]
getSquareMV :: MV.PrimMonad m => Int -> MV.MVector (MV.PrimState m) a -> m [a]
getSquareMV q mv = let (y, x) = quotRem q 3
start = x*3 + y*3*9
in do
a1 <- MV.read mv start
a2 <- MV.read mv (start + 1)
a3 <- MV.read mv (start + 2)
b1 <- MV.read mv (start + 9)
b2 <- MV.read mv (start + 10)
b3 <- MV.read mv (start + 11)
c1 <- MV.read mv (start + 18)
c2 <- MV.read mv (start + 19)
c3 <- MV.read mv (start + 20)
return [a1,a2,a3,b1,b2,b3,c1,c2,c3]
isUnique :: [Cell] -> Bool
isUnique xs = let sv = strip xs
in length sv == length (nub sv)
where strip (Empty:xs) = strip xs
strip ((Given x):xs) = x : strip xs
strip ((Filled x):xs) = x : strip xs
strip [] = []
Main.hs
module Main where
import Control.Parallel.Strategies
import Control.Monad
import Control.DeepSeq ( force )
import Sudoku
import System.Environment (getArgs)
main :: IO ()
main = do
filename <- head <$> getArgs
contents <- readFile filename
case getSudokus contents of
Just sudokus -> print $ runEval $ do
start <- forM sudokus (rpar . force . solve)
forM start rseq
Nothing -> putStrLn "Error during parsing"
I am compiling it with the following flags:
ghc-options: -O2 -rtsopts -threaded -eventlog
Executing it with the following flags:
cabal exec sudoku -- sudoku17.6.txt +RTS -N1 -s -l
gives the following performance report and ThreadScope diagram:
950,178,477,200 bytes allocated in the heap
181,465,696 bytes copied during GC
121,832 bytes maximum residency (7 sample(s))
30,144 bytes maximum slop
7 MiB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 227776 colls, 0 par 1.454s 1.633s 0.0000s 0.0011s
Gen 1 7 colls, 0 par 0.001s 0.001s 0.0001s 0.0002s
TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1)
SPARKS: 6 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 6 fizzled)
INIT time 0.001s ( 0.001s elapsed)
MUT time 220.452s (220.037s elapsed)
GC time 1.455s ( 1.634s elapsed)
EXIT time 0.000s ( 0.008s elapsed)
Total time 221.908s (221.681s elapsed)
Alloc rate 4,310,140,685 bytes per MUT second
Productivity 99.3% of total user, 99.3% of total elapsed
Execution with parallelization:
cabal exec sudoku -- sudoku17.6.txt +RTS -N6 -s -l
950,178,549,616 bytes allocated in the heap
325,450,104 bytes copied during GC
142,704 bytes maximum residency (7 sample(s))
82,088 bytes maximum slop
32 MiB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 128677 colls, 128677 par 37.697s 30.612s 0.0002s 0.0035s
Gen 1 7 colls, 6 par 0.005s 0.004s 0.0006s 0.0012s
Parallel GC work balance: 11.66% (serial 0%, perfect 100%)
TASKS: 14 (1 bound, 13 peak workers (13 total), using -N6)
SPARKS: 6 (5 converted, 0 overflowed, 0 dud, 0 GC'd, 1 fizzled)
INIT time 0.010s ( 0.009s elapsed)
MUT time 355.227s (184.035s elapsed)
GC time 37.702s ( 30.616s elapsed)
EXIT time 0.001s ( 0.007s elapsed)
Total time 392.940s (214.667s elapsed)
Alloc rate 2,674,847,755 bytes per MUT second
Productivity 90.4% of total user, 85.7% of total elapsed
Here are the contents of sudoku17.6.txt:
.......2143.......6........2.15..........637...........68...4.....23........7....
.......241..8.............3...4..5..7.....1......3.......51.6....2....5..3...7...
.......24....1...........8.3.7...1..1..8..5.....2......2.4...6.5...7.3...........
.......23.1..4....5........1.....4.....2...8....8.3.......5.16..4....7....3......
.......21...5...3.4..6.........21...8.......75.....6.....4..8...1..7.....3.......
.......215.3......6...........1.4.6.7.....5.....2........48.3...1..7....2........

Believe it or not, your problem potentially had nothing to do with parallelization. In the future I'd recommend that you first look at the input to the function you are trying to parallelize. It turned out you were always solving just a single puzzle.
Edit: @Noughtmare pointed out that, according to the ThreadScope results posted in the question, there is some parallelization going on. That is true, and it makes me believe that the file posted in the question doesn't exactly match the one used to create those results. If that's the case, you can skip to the Parallelization section for the answer to "Why does parallelizing this code yield almost no performance improvement on a six-core machine?"
Parser
Long story short, there is a bug in your parser. If you ask my honest opinion, it is actually a bug in the trifecta package's documentation, because parseString promises to fully consume the input:
Fully parse a String to a Result.
but instead it consumes only the first line and successfully returns the result. However, honestly, I've never used it before, so maybe that is the expected behavior.
Let's take a look at your parser:
parseSudoku :: Parser Sudoku
parseSudoku = do
lst <- replicateM 81 field
(newline *> return ()) <|> eof
return $ Sudoku $ generate 81 (lst !!)
where
field = (char '.' >> return Empty) <|> (Given . read . return <$> digit)
At first glance it looks just fine, until the input is examined closely. Every empty line between the lines with data also contains a newline character, but your parser expects at most one:
.......2143.......6........2.15..........637...........68...4.....23........7....
<this is also a newline>
.......241..8.............3...4..5..7.....1......3.......51.6....2....5..3...7...
So your parser should instead use:
many (newline *> return ()) <|> eof
Side note: if it were up to me, this is how I would write the parser:
parseSudoku :: Parser Sudoku
parseSudoku = do
(Sudoku <$> V.replicateM 81 field) <* ((() <$ many newline) <|> eof)
where
field = (Empty <$ char '.') <|> (Given . Data.Char.digitToInt <$> digit)
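One note on the side-note parser above (my addition, not part of the original answer): it assumes two imports that the question's Sudoku.hs does not have yet, namely
import qualified Data.Vector as V
import Data.Char (digitToInt)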
Parallelization
When it comes to the implementation of the parallelization, it seems to work fine, but the problem is that the workload is really unbalanced. That's why there is only about a 2x speed-up when using 6 cores. In other words, not all puzzles are created equally hard. Solving 6 puzzles on 6 cores in parallel will, at best, take as long as the slowest individual solution. Therefore, to gain more from parallelization you either need more puzzles or fewer CPU cores ;)
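If you do end up with a much longer list of puzzles, here is a minimal sketch (my addition, not part of the original answer; it assumes the question's Sudoku module unchanged) of a main that uses a bounded-buffer strategy instead of sparking every element with rpar up front:
import Control.Parallel.Strategies (parBuffer, rdeepseq, using)
import Sudoku (getSudokus, solve)
import System.Environment (getArgs)

-- Sketch: solve the puzzles in parallel, keeping at most 100 sparks
-- outstanding at a time; rdeepseq forces each solution to normal form
-- inside its spark, so the real solving work happens in parallel.
main :: IO ()
main = do
    filename <- head <$> getArgs
    contents <- readFile filename
    case getSudokus contents of
        Just sudokus -> print (map solve sudokus `using` parBuffer 100 rdeepseq)
        Nothing      -> putStrLn "Error during parsing"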
EDIT: Here are some benchmarks to support my explanation above.
These are the results for solving each individual puzzle:
And these two are the sequential and parallelized solvers using one core and six cores respectively.
As you can see, solving the second puzzle (index 1) took the longest, a little over 100 seconds on my computer. This is also the time it took the parallelized algorithm to solve all of the puzzles, which makes sense: all five of the other puzzles were solved much more quickly, and the cores that were freed up had no other work to do.
Also, as a sanity check, if you sum up the times it took to solve the individual puzzles, the total matches up pretty well with the time it took to solve all of them sequentially.

Related

Improve performance of finding graph diameter in Haskell

I'm solving the following problem, which in essence is "find the diameter of a connected undirected weighted graph", in Haskell. The solution below produces correct answers, but exceeds the time limit on 9 of the 27 tests. I'm far from a Haskell prodigy; can you give me a clue as to whether and how I can improve the performance of my solution without using the built-in Data.Graph module? I tried using accumulator parameters, strict pairs, and strict evaluation in some places, but either I used them incorrectly or the performance issue is elsewhere. Thanks in advance!
import qualified Data.Map as Map
import qualified Data.Set as Set
import Data.List (maximumBy)
import Data.Ord (comparing)
buildGraph :: [Int] -> Map.Map Int [(Int, Int)] -> Map.Map Int [(Int, Int)]
buildGraph [] acc = acc
buildGraph (from:to:dist:rest) acc = let withTo = Map.insertWith (++) from [(to, dist)] acc
withFromTo = Map.insertWith (++) to [(from, dist)] withTo
in buildGraph rest $ withFromTo
data Queue a = Queue {
ingoing :: [a]
, outgoing :: [a]
} deriving Show
toQueue xs = Queue [] xs
enqMany xs (Queue is os) = (Queue (reverse xs ++ is) os)
deq (Queue is []) = deq (Queue [] $ reverse is)
deq (Queue is (o:os)) = (o, Queue is os)
extract :: (Ord a) => a -> Map.Map a [b] -> [b]
extract k m = case Map.lookup k m of
Just value -> value
Nothing -> error "sdfsd" -- should never happen
bfs node graph = bfs' Set.empty (toQueue [(node, 0)]) []
where
bfs' :: Set.Set Int -> Queue (Int, Int) -> [(Int, Int)] -> [(Int, Int)]
bfs' visited (Queue [] []) acc = acc
bfs' visited que acc = let ((n, dist), rest) = deq que
in if Set.member n visited
then bfs' visited rest acc
else let children = map (\(i, d) -> (i, d + dist)) $ extract n graph
newNodes = enqMany children rest
in bfs' (Set.insert n visited) newNodes ((n, dist):acc)
findMostDistant xs = maximumBy (comparing snd) xs
solve input = answer
where
-- the first number is the number of edges and is not necessary
(_:triples) = map read $ words input
graph = buildGraph triples Map.empty
-- pick arbitary node, find the farther node from it using bfs
(mostDistant, _) = findMostDistant $ bfs (head triples) graph
-- find the farthest node from the previously farthest node, counting the distance on the way
(_, answer) = findMostDistant $ bfs mostDistant graph
tests = [
"11 2 7 2 1 7 6 5 1 8 2 8 6 8 6 9 10 5 5 9 1 9 0 10 15 3 1 21 6 4 3" -- 54
, "5 3 4 3 0 3 4 0 2 6 1 4 9" -- 22
, "16 2 3 92 5 2 10 14 3 42 2 4 26 14 12 50 4 6 93 9 6 24 15 14 9 0 2 95 8 0 90 0 13 60 9 10 59 1 0 66 11 12 7 7 10 35" -- 428
]
runZeroTests = mapM_ print $ map solve tests
main = do
answer <- solve <$> getContents
print answer
deq (Queue [] []) causes an infinite loop, I think.
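As a hedged sketch (my addition, not from the thread), a total dequeue for that Queue type would report emptiness instead of looping:
-- Return Nothing on an empty queue instead of recursing forever.
deqSafe :: Queue a -> Maybe (a, Queue a)
deqSafe (Queue [] [])     = Nothing
deqSafe (Queue is [])     = deqSafe (Queue [] (reverse is))
deqSafe (Queue is (o:os)) = Just (o, Queue is os)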
When I've solved contest problems in Haskell, typically the biggest performance hog has been the slow I/O library, which operates on lazy linked lists of wide characters. The first thing I always do for a programming contest is replace that with fast I/O.
Here's a version that makes minimal changes to the program logic and just replaces the I/O with Data.ByteString.Lazy.Char8, implemented as a lazily-evaluated list of strict byte arrays, and Data.ByteString.Builder, which builds a function to fill an output buffer. It should be useful for measuring the speed-up from fast I/O alone.
{-# LANGUAGE OverloadedStrings #-} -- Added
import Data.ByteString.Builder
(Builder, char7, intDec, toLazyByteString) -- Added
import qualified Data.ByteString.Lazy.Char8 as B8 -- Added
import qualified Data.Map as Map
import qualified Data.Set as Set
import Data.List (maximumBy)
import Data.Maybe (fromJust) -- Added
import Data.Monoid ((<>)) -- Added
import Data.Ord (comparing)
buildGraph :: [Int] -> Map.Map Int [(Int, Int)] -> Map.Map Int [(Int, Int)]
buildGraph [] acc = acc
buildGraph (from:to:dist:rest) acc = let withTo = Map.insertWith (++) from [(to, dist)] acc
withFromTo = Map.insertWith (++) to [(from, dist)] withTo
in buildGraph rest $ withFromTo
data Queue a = Queue {
ingoing :: [a]
, outgoing :: [a]
} deriving Show
toQueue xs = Queue [] xs
enqMany xs (Queue is os) = (Queue (reverse xs ++ is) os)
deq (Queue is []) = deq (Queue [] $ reverse is)
deq (Queue is (o:os)) = (o, Queue is os)
extract :: (Ord a) => a -> Map.Map a [b] -> [b]
extract k m = case Map.lookup k m of
Just value -> value
Nothing -> error "sdfsd" -- should never happen
bfs node graph = bfs' Set.empty (toQueue [(node, 0)]) []
where
bfs' :: Set.Set Int -> Queue (Int, Int) -> [(Int, Int)] -> [(Int, Int)]
bfs' visited (Queue [] []) acc = acc
bfs' visited que acc = let ((n, dist), rest) = deq que
in if Set.member n visited
then bfs' visited rest acc
else let children = map (\(i, d) -> (i, d + dist)) $ extract n graph
newNodes = enqMany children rest
in bfs' (Set.insert n visited) newNodes ((n, dist):acc)
findMostDistant xs = maximumBy (comparing snd) xs
solve triples = answer -- Changed (by deleting one line)
where
graph = buildGraph triples Map.empty
-- pick arbitary node, find the farther node from it using bfs
(mostDistant, _) = findMostDistant $ bfs (head triples) graph
-- find the farthest node from the previously farthest node, counting the distance on the way
(_, answer) = findMostDistant $ bfs mostDistant graph
tests = [ -- Unchanged, but now interpreted as OverloadedStrings
"11 2 7 2 1 7 6 5 1 8 2 8 6 8 6 9 10 5 5 9 1 9 0 10 15 3 1 21 6 4 3" -- 54
, "5 3 4 3 0 3 4 0 2 6 1 4 9" -- 22
, "16 2 3 92 5 2 10 14 3 42 2 4 26 14 12 50 4 6 93 9 6 24 15 14 9 0 2 95 8 0 90 0 13 60 9 10 59 1 0 66 11 12 7 7 10 35" -- 428
]
runZeroTests = B8.putStr -- Changed
. toLazyByteString
. foldMap format
. map (solve . parse)
$ tests
main :: IO () -- Changed
main = B8.interact ( toLazyByteString . format . solve . parse )
parse :: B8.ByteString -> [Int] -- Added
-- the first number is the number of edges and is not necessary
parse = map (fst . fromJust . B8.readInt) . tail . B8.words
format :: Int -> Builder -- Added
format n = intDec n <> eol where
eol = char7 '\n'
With help from @Davislor on doing I/O with ByteString, and a few other things, I managed to get 100 points on the problem. In the end, what I did to optimize it was:
Using ByteString I/O as @Davislor suggested.
Since I knew the integers in the input were valid, I wrote my own parseInt function that does not perform unnecessary checks.
Instead of a lazy Map, I used an Array to build the adjacency list. I do not know the asymptotic complexity of constructing an Array with accumArray (I believe it should be O(n)), but lookup in the array should be O(1), instead of the O(log n) for the Map.
Here is the final solution:
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
import Data.ByteString.Builder
(Builder, char7, intDec, toLazyByteString)
import qualified Data.ByteString.Lazy.Char8 as B8
import qualified Data.Set as Set
import Data.Monoid ((<>))
import Data.Char (ord)
import Data.ByteString (getLine)
import Data.Array (Array, array, accumArray, (!), (//))
buildAdjList :: Int -> [Int] -> Array Int [(Int, Int)]
buildAdjList n xs = accumArray (flip (:)) [] (0, n) $ triples xs []
where
triples [] res = res
triples (x:y:dist:rest) res = let edgeXY = (x, (y, dist))
edgeYX = (y, (x, dist))
in triples rest (edgeXY:edgeYX:res)
data Queue a = Queue {
ingoing :: [a]
, outgoing :: [a]
} deriving Show
enqMany xs (Queue is os) = Queue (reverse xs ++ is) os
deq (Queue [] []) = error "gosho"
deq (Queue is []) = deq (Queue [] $ reverse is)
deq (Queue is (o:os)) = (o, Queue is os)
bfs !node adjList = let start = (node, 0) in bfs' Set.empty (Queue [] [start]) start
  where
    bfs' :: Set.Set Int -> Queue (Int, Int) -> (Int, Int) -> (Int, Int)
    bfs' visited (Queue [] []) !ans = ans
    bfs' visited que !ans = let (curr@(n, dist), rest) = deq que
                            in if Set.member n visited
                               then bfs' visited rest ans
                               else let children = map (\(i, d) -> (i, d + dist)) $ adjList ! n
                                        newNodes = enqMany children rest
                                    in bfs' (Set.insert n visited) newNodes (longerEdge curr ans)
longerEdge :: (Int, Int) -> (Int, Int) -> (Int, Int)
longerEdge a b = if (snd a) < (snd b) then b else a
parseInt :: B8.ByteString -> Int
parseInt str = parseInt' str 0 where
parseInt' str !acc
| B8.null str = acc
| otherwise = parseInt' (B8.tail str) $ ((ord $ B8.head str) - 48 + acc * 10)
parseIntList :: B8.ByteString -> [Int]
parseIntList = map parseInt . B8.words
solve :: [Int] -> Int
solve (n:triples) = answer
where
graph = buildAdjList n triples
-- pick arbitary node, find the farther node from it using bfs
(mostDistant, _) = bfs (head triples) graph
-- find the farthest node from the previously farthest node, counting the distance on the way
(_, answer) = bfs mostDistant graph
main :: IO ()
main = B8.interact ( toLazyByteString . intDec . solve . parseIntList )
-- debug code below
tests = [
"11 2 7 2 1 7 6 5 1 8 2 8 6 8 6 9 10 5 5 9 1 9 0 10 15 3 1 21 6 4 3" -- 54
, "5 3 4 3 0 3 4 0 2 6 1 4 9" -- 22
, "16 2 3 92 5 2 10 14 3 42 2 4 26 14 12 50 4 6 93 9 6 24 15 14 9 0 2 95 8 0 90 0 13 60 9 10 59 1 0 66 11 12 7 7 10 35" -- 428
]
runZeroTests = B8.putStr
. toLazyByteString
. foldMap format
. map (solve . parseIntList)
$ tests
format :: Int -> Builder
format n = intDec n <> eol
where eol = char7 '\n'
There could still be room for improvement: the Set of visited nodes could be changed to a bit array, Int32 could be used instead of Int, and BangPatterns could be applied more aggressively, although I feel like I can't really make sense of the evaluation order of Haskell programs yet.
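For the first of those ideas, here is a minimal sketch (my illustration, not part of the accepted solution) of keeping the visited set in a mutable unboxed Bool array, assuming node ids lie in [0, n]; it is shown as a standalone pass that keeps only the first occurrence of each node in a traversal order:
import Control.Monad.ST (ST, runST)
import Data.Array.ST (STUArray, newArray, readArray, writeArray)

-- Sketch: mark visits in a mutable unboxed Bool array instead of a Data.Set.
-- Node ids are assumed to lie in [0, n].
firstVisits :: Int -> [Int] -> [Int]
firstVisits n nodes = runST (do
    seen <- newSeen
    let go [] = return []
        go (v:vs) = do
            alreadySeen <- readArray seen v
            if alreadySeen
                then go vs
                else do
                    writeArray seen v True
                    (v :) <$> go vs
    go nodes)
  where
    newSeen :: ST s (STUArray s Int Bool)
    newSeen = newArray (0, n) False
In the solution above, the same idea would replace the Set.member / Set.insert pair inside bfs'.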

Haskell: How to change algorithm to work on any size of list?

I have this code:
project= [
[(a,b),(c,d),(e,f)]
|
a<-[1..5],
b<-[1..3],
c<-[1..5],
d<-[1..3],
e<-[1..5],
f<-[1..3]
, a*b + c*d + e*f <6
, a + c + e == 5
, b == 3 || d==3 || f==3
]
x=take 1 project
main = print $ x
It returns a list of 3 pairs, [(x,y),(x,y),(x,y)].
There are 3 conditions:
If you sum all the x's you must get 5.
If you sum all the x*y products you must get less than 6.
There is at least one y equal to 3.
Now I want exactly the same algorithm to work for any longer list, for example 10 pairs. How should I do that?
Here:
import Control.Applicative (liftA2)
import Control.Monad (replicateM)

project n =
  [ x
  | x <- replicateM n $ liftA2 (,) [1..5] [1..3]
  , sum (map (uncurry (*)) x) < 6
  , sum (map fst x) == 5
  , any ((==3) . snd) x
  ]

main = print $ take 1 $ project 3
Or like so:
project n
= filter (any ((==3) . snd))
$ filter ((==5) . sum . map fst)
$ filter ((<6) . sum . map (uncurry (*)))
$ replicateM n
$ liftA2 (,) [1..5] [1..3]

Trying to create an efficient algorithm for a function in Haskell

I'm looking for an efficient polynomial-time solution to the following problem:
Implement a recursive function node x y for calculating the (x,y)-th number in a number triangle defined as
g(x, y) = 0                            if |x| > y
        = 1                            if (x, y) = (0, 0)
        = sum of all incoming paths    otherwise
The sum of all incoming paths to a node is defined as the sum of the values of all possible paths from the root node (x, y) = (0, 0) to the node under consideration, where at each node (x,y) a path can either continue diagonally down and left (x−1,y+1), straight down (x,y+1), or diagonally down and right (x+1,y+1). The value of a path to a node is defined as the sum of all the nodes along that path up to, but not including, the node under consideration.
The first few entries in the number triangle are given in the table:
 \ x  -3   -2   -1    0    1    2    3
y \  _________________________________
  0 |  0    0    0    1    0    0    0
  1 |  0    0    1    1    1    0    0
  2 |  0    2    4    6    4    2    0
  3 |  4   16   40   48   40   16    4
I am trying to work out a naive solution first, here is what I have:
node x y | y < 0 = error "number cannot be negative"
| (abs x) > y = 0
| (x == 0) && (y == 0) = 1
| otherwise = node (x+1) (y-1) + node x (y-1) + node (x-1) (y-1)
Whenever I run this I get:
*** Exception: stack overflow
Why does that happen?
I believe your problem is a bit more complicated than your example code suggests. First, let's be clear about some definitions here:
Let pathCount x y be the number of paths that end at (x, y). We have
pathCount :: Int -> Int -> Integer
pathCount x y
| y == 0 = if x == 0 then 1 else 0
| otherwise = sum [ pathCount (x + d) (y - 1) | d <- [-1..1]]
Now let pathSum x y be the sum of the values of all paths that end at (x, y). We have:
pathSum :: Int -> Int -> Integer
pathSum x y
| y == 0 = if x == 0 then 1 else 0
| otherwise = sum [ pathSum (x + d) (y - 1) + node x y * pathCount (x + d) (y - 1)
| d <- [-1..1] ]
With this helper, we can finally define node x y properly:
node :: Int -> Int -> Integer
node x y
| y == 0 = if x == 0 then 1 else 0
| otherwise = sum [ pathSum (x + d) (y - 1) | d <- [-1..1]]
This algorithm as such is exponential time in its current form. We can however add memoization to make the number of additions quadratic. The memoize package on Hackage makes this easy as pie. Full example:
import Control.Monad
import Data.List (intercalate)
import Data.Function.Memoize (memoize2)
node' :: Int -> Int -> Integer
node' x y
| y == 0 = if x == 0 then 1 else 0
| otherwise = sum [ pathSum (x + d) (y - 1) | d <- [-1..1]]
node = memoize2 node'
pathCount' :: Int -> Int -> Integer
pathCount' x y
| y == 0 = if x == 0 then 1 else 0
| otherwise = sum [ pathCount (x + d) (y - 1) | d <- [-1..1]]
pathCount = memoize2 pathCount'
pathSum' :: Int -> Int -> Integer
pathSum' x y
| y == 0 = if x == 0 then 1 else 0
| otherwise = sum [ pathSum (x + d) (y - 1) + node x y * pathCount (x + d) (y - 1)
| d <- [-1..1] ]
pathSum = memoize2 pathSum'
main =
forM_ [0..n] $ \y ->
putStrLn $ intercalate " " $ map (show . flip node y) [-n..n]
where n = 5
Output:
0 0 0 0 0 1 0 0 0 0 0
0 0 0 0 1 1 1 0 0 0 0
0 0 0 2 4 6 4 2 0 0 0
0 0 4 16 40 48 40 16 4 0 0
0 8 72 352 728 944 728 352 72 8 0
16 376 4248 16608 35128 43632 35128 16608 4248 376 16
As you can see, the size of the numbers gets out of hand rather quickly, so the runtime is not O(n^2), even though the number of arithmetic operations is.
You're thinking in terms of outgoing paths, when you should be thinking in terms of incoming paths. Your recursive step is currently looking for nodes from below, instead of above.
First of all, sorry if this is long. I wanted to explain the step-by-step thought process.
To start off with, you need one crucial fact: you can represent the "answer" at each "index" by a list of paths. For all the zeros this is [[]], for your base case it is [[1]], and, for example, for (0,2) it is [[6,1,1],[6,1,1],[6,1,1]]. This may seem redundant, but it simplifies things down the road. Extracting the answer is then head . head if the list is non-empty, or 0 if it is empty.
This is very useful because you can store the answer as a list of rows (the first row would be [[1]], [], [], ...) and the results of any given row depend only on the previous row.
Secondly, this problem is symmetrical. This is pretty obvious.
The first thing we will do mirrors the definition of fib very closely:
type Path = [[Integer]]
triangle' :: [[Path]]
triangle' = ([[1]] : repeat []) : map f triangle'
We know this must be close to correct, since the 2nd row will depend on the first row only, the third on the 2nd only, etc. So the result will be
([[1]] : repeat []) : f ([[1]] : repeat []) : f ....
Now we just need to know what f is. Firstly, its type: [Path] -> [Path]. Quite simply, given the previous row, return the next row.
Now you may see another problem arising. Each invocation of f needs to know how many columns are in the current row. We could count the non-null elements in the previous row, but it is simpler to pass the parameter directly, so we change map f triangle' to zipWith f [1..] triangle', giving f the type Int -> [Path] -> [Path].
f needs to handle one special case and one general case. The special case is x = 0: here we simply treat the (x+1, y-1) and (x-1, y-1) recursions as the same, and otherwise it is identical to the general case. Let's make two functions, g0 and gn, which handle these two cases.
The actual computation of gn is easy. We know that for some x we need the elements x-1, x and x+1 of the previous row. So if we drop x-1 elements before giving the previous row to the xth invocation of gn, gn can just take the first 3 elements and it will have what it needs. We write this as follows:
f :: Int -> [Path] -> [Path]
f n ps = g0 ps : map (gn . flip drop ps) [0..n-1] ++ repeat []
The repeat [] at the end should be obvious: for indices outside the triangle, the result is 0.
Now writing g0 and gn is really quite simple:
g0 :: [Path] -> Path
g0 (a:b:_) = map (s:) q
where
s = sum . concat $ q
q = b ++ a ++ b
gn :: [Path] -> Path
gn (a:b:c:_) = map (s:) q
where
s = sum . concat $ q
q = a ++ b ++ c
On my machine this version is about 3-4 times faster than the fastest version I could write with normal recursion and memoization.
The rest is just printing or pulling out the number you want.
triangle :: Int -> Int -> Integer
triangle x y = case (triangle' !! y) !! (abs x) of
[] -> 0
xs -> head $ head xs
triList :: Int -> Int -> Path
triList x y = (triangle' !! y) !! (abs x)
printTri :: Int -> Int -> IO ()
printTri width height =
putStrLn $ unlines $ map unwords
[[ p $ triangle x y | x <- [-x0..x0]] | y <- [0..height]]
where maxLen = length $ show $ triangle 0 height
x0 = width `div` 2
p = printf $ "%" ++ show maxLen ++ "d "

How do you find the definition of a function when all you have is a huge set of input/output pairs?

Suppose that you were given a list of input/output pairs:
f 0 = 0
f 1 = 2
f 2 = 1
f 3 = -1
f 4 = 0
f 5 = 0
f 6 = -76
f 7 = -3
f 8 = 3
f 9 = -1
f 10 = -1
f 11 = -6
f 12 = -1
f 13 = -1
f 14 = 4
f 15 = -2
f 16 = -10
f 17 = 0
f 18 = 0
f 19 = -1
f 20 = 2
f 21 = 3
f 22 = 0
f 23 = 4
f 24 = 2
f 25 = -1
f 26 = 0
f 27 = 0
f 28 = -4
f 29 = -2
f 30 = -14
Now suppose you were asked to find the definition of f using a proper, small mathematical formula instead of an enumeration of values. That is, the answer should be f x = floor(tan(x*x-3)) (or similar), because that is a small formula that is correct for every input. How would you do it?
So let's simplify. You want a function such that
f 1 = 10
f 2 = 3
f 3 = 8
There exists a formula for immediately finding a polynomial function which meets these demands. In particular
f x = 6 * x * x - 25 * x + 29
works. It turns out to be the case that if you have the graph of any function
{ (x_1, y_1), (x_2, y_2), ..., (x_i, y_i) }
you can immediately build a polynomial which exactly matches those inputs and outputs.
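To make that concrete, here is a small sketch (my addition, not from the original answer) that evaluates the Lagrange interpolating polynomial through a list of points; with the three points above it agrees with 6 * x * x - 25 * x + 29 everywhere:
-- Evaluate the Lagrange interpolating polynomial through pts at a point q.
-- For pts = [(1,10),(2,3),(3,8)] this reproduces f x = 6*x*x - 25*x + 29.
lagrange :: [(Double, Double)] -> Double -> Double
lagrange pts q = sum [ y * basis x | (x, y) <- pts ]
  where
    basis xi = product [ (q - xj) / (xi - xj) | (xj, _) <- pts, xj /= xi ]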
So, given that polynomials like this exist you're never going to solve your problem (finding a particular solution like floor(tan(x*x-3))) without enforcing more constraints. In particular, if you don't somehow outlaw or penalize polynomials then I'm always going to deliver them to you.
In general, what you'd like to do is (a) define a search space and (b) define a metric of fitness, also known as a loss function. If your search space is finite then you have yourself a solution immediately: rank every element of your search space according to your loss function and select randomly from the set of solutions which tie for best.
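A minimal sketch of that finite case (my illustration; the loss function here is an assumed sum of squared errors, and ties are broken by taking the first best candidate rather than randomly):
import Data.List (minimumBy)
import Data.Ord (comparing)

-- Rank every candidate in a finite search space by its loss against the
-- observed input/output samples and return the best-scoring one.
bestCandidate :: [c] -> (c -> Int -> Int) -> [(Int, Int)] -> c
bestCandidate space apply samples = minimumBy (comparing loss) space
  where
    loss c = sum [ (apply c x - y) ^ (2 :: Int) | (x, y) <- samples ]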
What it sounds like you're asking for is much harder though—if you're looking through the space of all possible programs then that space is unbelievably large. Searching it exhaustively is impossible unless we constrain ourselves heavily or accept approximation. Secondly, we must have very good understanding of your loss function and how it interacts with the search space as we'll want to make intelligent guesses to move forward through this vast space.
You mention genetic algorithms—they're often lauded for this kind of work and indeed they can be a method of driving search through a large space with an uncertain loss function, but they also fail as often as they succeed. Someone who is genuinely skilled at using genetic algorithms to solve problems will spend all of their time crafting the search space and the loss function to direct the algorithm toward meaningful answers.
Now this can be done for general programs if you're careful. In fact, this was the subject of last year's ICFP programming contest. In particular, search on this page for "Rules of the ICFP Contest 2013" to see the set up.
I think feed-forward neural networks (FFNN) and genetic programming (GP) are good techniques for simulating complicated functions.
If you need the function as a polynomial, use GP; otherwise an FFNN is very simple, and MATLAB has a library for it.
I think the "interpolation" answers don't get what I am asking. Maybe I was not clear enough, but fortunately I've managed to get a semi-satisfactory answer to my question myself, using a brute-force search algorithm. Using only a list of input/output pairs, as presented in the question, I was able to recover the original function. The comments in this snippet should explain it:
import Control.Monad.Omega
{- First we define a simple evaluator for mathematical expressions -}
data A = Add A A | Mul A A | Div A A | Sub A A | Pow A A |
Sqrt A | Tan A | Sin A | Cos A |
Num Float | X deriving (Show)
eval :: A -> Float -> Float
eval (Add a b) x = eval a x + eval b x
eval (Mul a b) x = eval a x * eval b x
eval (Div a b) x = eval a x / eval b x
eval (Sub a b) x = eval a x - eval b x
eval (Pow a b) x = eval a x ** eval b x
eval (Sqrt a) x = sqrt (eval a x)
eval (Tan a) x = tan (eval a x)
eval (Sin a) x = sin (eval a x)
eval (Cos a) x = cos (eval a x)
eval (Num a) x = a
eval X x = x
{- Now we enumerate all possible terms of that grammar -}
allTerms = do
which <- each [1..15]
if which == 1 then return X
else if which == 2 then do { x <- allTerms; y <- allTerms; return (Add x y) }
else if which == 3 then do { x <- allTerms; y <- allTerms; return (Mul x y) }
else if which == 4 then do { x <- allTerms; y <- allTerms; return (Div x y) }
else if which == 5 then do { x <- allTerms; y <- allTerms; return (Sub x y) }
else if which == 6 then do { x <- allTerms; y <- allTerms; return (Pow x y) }
else if which == 7 then do { x <- allTerms; y <- allTerms; return (Sqrt x) }
else if which == 8 then do { x <- allTerms; y <- allTerms; return (Tan x) }
else if which == 9 then do { x <- allTerms; y <- allTerms; return (Sin x) }
else if which == 10 then do { x <- allTerms; y <- allTerms; return (Cos x) }
else return (Num (which-10))
{- Then we create 20 input/output pairs of a random function -}
fun x = x+tan(x*x)
maps = let n=20 in zip [1..n] (map fun [1..n])
{- This tests a function in our language against a map of in/out pairs -}
check maps f = all test maps where
test (a,b) = (eval f a) == b
{- Now let's see if a brute-force search can recover the original program
   from the list of input/output pairs alone! -}
main = print $ take 1 $ filter (check maps) (runOmega allTerms)
{- Output: [Add X (Tan (Mul X X))]
   Yay! Even though there are infinitely many possible solutions,
   the first solution found is actually our initial program.
-}
One possible definition goes like this:
f 0 = 0
f 1 = 2
f 2 = 1
f 3 = -1
f 4 = 0
f 5 = 0
f 6 = -76
f 7 = -3
f 8 = 3
f 9 = -1
f 10 = -1
f 11 = -6
f 12 = -1
f 13 = -1
f 14 = 4
f 15 = -2
f 16 = -10
f 17 = 0
f 18 = 0
f 19 = -1
f 20 = 2
f 21 = 3
f 22 = 0
f 23 = 4
f 24 = 2
f 25 = -1
f 26 = 0
f 27 = 0
f 28 = -4
f 29 = -2
f 30 = -14

how to generate a series representing the binary expansion of 'e'

I'm trying to find the first 100,000 binary digits in the expansion of 'e'. Is there an algorithm to generate the binary digits of 'e' as an infinite list?
Here's an unbounded spigot for e in Haskell:
main = print $ stream (1,0,1) [(n, a*d, d) | (n,d,a) <- map f [1..]]
where
f k = (1, k, 1)
stream z (x:xs)
| lbound == approx z 2 = lbound : stream (mul (10, -10*lbound, 1) z) (x:xs)
| otherwise = stream (mul z x) xs
where
lbound = approx z 1
approx (a,b,c) n = (a*n + b) `div` c
mul (a,b,c) (d,e,f) = (a*d, a*e + b*f, c*f)
Based on the Programming Praxis unbounded spigot for e and pi, which in turn is derived from Gibbons's first unbounded spigot for pi.
$ runhaskell A.hs
[2,7,1,8,2,8,1,8,2,8,4,5,9,0,4,5,2,3,5,3,6, ^C
I'd recommend Gibbons's paper if you're interested in these fun algorithms.
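Since the question asks for binary digits, here is a sketch of the same spigot with the extraction base changed from 10 to 2 (my adaptation, not part of the original answer): the safety test is unchanged, only the shift applied after each emitted digit uses base 2, so the head of the list is the integer part (2) and the tail should be the bits of the fractional part.
-- Same streaming spigot, but the post-digit shift uses base 2, so after the
-- leading integer part the list yields the binary digits of e's fraction.
binaryE :: [Integer]
binaryE = stream (1,0,1) [(n, a*d, d) | (n,d,a) <- map f [1..]]
  where
    f k = (1, k, 1)
    stream z (x:xs)
      | lbound == approx z 2 = lbound : stream (mul (2, -2*lbound, 1) z) (x:xs)
      | otherwise            = stream (mul z x) xs
      where lbound = approx z 1
    approx (a,b,c) n = (a*n + b) `div` c
    mul (a,b,c) (d,e,f) = (a*d, a*e + b*f, c*f)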
You might be interested in using CReal for this. For 100,000 binary digits, 30,200 decimal digits is enough:
Prelude> 100000 * logBase 10 2
30102.999566398114
Prelude> :m + Data.Number.CReal
Prelude> :set +s
Prelude Data.Number.CReal> last $ showCReal 1000 (exp 1)
'4'
(0.34 secs, 34061824 bytes)
Prelude Data.Number.CReal> last $ showCReal 2000 (exp 1)
'4'
(1.25 secs, 104478784 bytes)
Prelude Data.Number.CReal> last $ showCReal 4000 (exp 1)
'7'
(5.96 secs, 355775928 bytes)
Prelude Data.Number.CReal> last $ showCReal 8000 (exp 1)
'2'
(20.89 secs, 1298942504 bytes)
This pattern looks about quadratic to me, so computing the first 30,200 digits of exp 1 looks like it might reasonably finish in about five or six minutes here on my machine. A patch to output in binary directly (and therefore avoid converting to decimal and back) would likely be accepted.
edit: Projection satisfied, just under six minutes of compute time!
Prelude Data.Number.CReal> showCReal 30200 (exp 1)
"2.718281828459045235360287471352662497757247093699959574966967627724076630353547594571382178525166427427466391932003059921817413596629043572900334...middle snipped due to StackOverflow message limit...39106913376148418348845963656215266103322394174671"
(349.44 secs, 17096829912 bytes)
