I am trying to run multiple regressions for all possible combinations of selected variables. My code runs, but it takes a long time and the memory allocation seems very large. Is there a way to optimize this:
using DataFrames
using Statistics
using StatsBase   # wsample below comes from StatsBase
using GLM
using Combinatorics
using Distributions
### read the data
reg_dat = DataFrame(
sample = wsample([1,2],[0.8,0.2], 1000000),y_var = rand(1000000),
ind = rand(1000000), brak = rand(1000000), times = rand(1000000),
tiny = rand(1000000), regr = rand(1000000), breaker = 1 .- rand(1000000),
x_var = 10*(1 .- rand(1000000)), kink = rand(1000000), h_var = rand(1000000),
ind_x = rand(1000000), brak_x = 1 .- rand(1000000), times_x = 1 .- rand(1000000),
tiny_x = 1 .- rand(1000000), regr_x = rand(1000000), breaker_x = rand(1000000),
t_var = rand(1000000), x_var_x = rand(1000000), kink_x = rand(1000000),
units = rand((1,2,3,4,6,7,8,9,15,78,76,10,23,45,54,87,98), 1000000),
trent = 1:1000000, rz = rand(string.(1:25), 1000000)
)
# find all possible combinations of the independent variables
function combinat(names::Array{String,1},indices::Array{Int64,1},N::Int64)::Vector{Array{String,1}}
cols = Vector{String}()
coms = Vector{Array{String}}()
for i in indices
push!(cols, names[i]::String)
end
for j in 1:N
append!(coms, collect(combinations(cols,j))::Array{Array{String,1},1})
end
return coms
end
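As an aside, the same enumeration can be written more compactly (a sketch, using the same combinations from Combinatorics.jl and producing the same result):
# all combinations of the selected column names, of every size from 1 to N
combinat_compact(names, indices, N) = reduce(vcat, [collect(combinations(names[indices], j)) for j in 1:N])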
combs = combinat(names(reg_dat),[3:20...], length([3:20...]))
### Find all unique units
units = unique(reg_dat.units)
## Convert combinations to regression formulas
function convert_to_formula(vals::Array{Array{String,1},1})::Array{FormulaTerm{Term,R} where R,1}
forms = Array{FormulaTerm{Term,R} where R,1}()
for i in vals
new_i = [i;["rz","trent"]]
symbs = Array{Symbol,1}()
for j in new_i
push!(symbs, Symbol(j))
end
push!(forms, Term(:y_var) ~ sum(term.(symbs)))
end
return forms
end
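As a quick sanity check, the first combination produces the formula whose string form also appears in the timing output further below:
check = convert_to_formula(combs[1:1])[1]
string(check)   # "y_var ~ ind + rz + trent"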
# Run regressions for each combination of units and combs
function logit_run(data::DataFrame, units::Array{Int64,1}, ind_vars::Array{Array{String,1},1})::Array{Tuple{Int64,String,Float64},1}
forms = convert_to_formula(ind_vars)
out = Array{Tuple{Int64,String,Float64},1}()
for i in units
data_train = data[(data.sample .== 1) .& (data.units .== i),:]
data_test = data[(data.sample .== 2) .& (data.units .== i),:]
for j in forms
try
logit = glm(j, data_train, Binomial(), LogitLink(), contrasts = Dict(:rz => DummyCoding()))
devs = data_test.y_var - predict(logit,data_test)
push!(out, (i, string(j), sqrt(mean(devs .* devs))))
catch
push!(out, (i, string(j), NaN))
end
end
end
return out
end
Now when I run:
@time logit_run(reg_dat, units, combs[1:10])
18.413586 seconds (26.24 M allocations: 10.644 GiB, 10.50% gc time)
170-element Array{Tuple{Int64,String,Float64},1}:
(1, "y_var ~ ind + rz + trent", 0.2880341157225821)
(1, "y_var ~ brak + rz + trent", 0.2880411138604235)
(1, "y_var ~ times + rz + trent", 0.2880380466963764)
(1, "y_var ~ tiny + rz + trent", 0.2880396984065766)
(1, "y_var ~ regr + rz + trent", 0.2880343939542883)
(1, "y_var ~ breaker + rz + trent", 0.2880393689495619)
(1, "y_var ~ x_var + rz + trent", 0.288038968246708)
(1, "y_var ~ kink + rz + trent", 0.288043185030096)
(1, "y_var ~ h_var + rz + trent", 0.28804607747341865)
(1, "y_var ~ ind_x + rz + trent", 0.2880387457490556)
(98, "y_var ~ ind + rz + trent", 0.29044425999770246)
(98, "y_var ~ brak + rz + trent", 0.2904354957341227)
(98, "y_var ~ times + rz + trent", 0.2904391507891304)
The memory allocation is high and it is not as fast as I would like. I will be running millions of iterations with this, so I need each batch of 170 elements to be computed faster than this. Any help is appreciated.
Update:
Using a low-level API did not quite work for me, so I found a Python implementation of logistic regression and rewrote it in Julia:
using LinearAlgebra   # Diagonal is used in the hand-rolled IRLS below
### Adding an intercept of ones
reg_dat[!, :Intercept] = ones(nrow(reg_dat))
### Creating dummies for the rz variable
for i in unique(reg_dat.rz)[2:end]
reg_dat[!, Symbol(i)] = ifelse.(reg_dat[!, :rz] .== i, 1, 0)
end
# find all possible combinations
function combinat(indices::Array{Int64,1},N::Int64)::Vector{Array{Int64,1}}
coms = Vector{Array{Int64}}()
for j in 1:N
append!(coms, collect(combinations(indices,j)))
end
return coms
end
combins = combinat([3:20...], length([3:20...]))
unit_s = unique(reg_dat.units)
### Now the logistic regression function from scratch
function log_reod(unit_s::Array{Int64,1}, combos::Array{Array{Int64,1},1}, cols::Array{String,1}, mydata::DataFrame, iterations::Int64)
out = Array{Tuple{Int64,String,Float64},1}()
lk = ReentrantLock()   # push! to the shared output vector is not thread-safe, so guard it
Threads.@threads for j in combos
for i in unit_s
ab_fin = NaN   # initialised outside the try so it is still visible after the catch
try
train = mydata[(mydata.sample .== 1) .& (mydata.units .== i),:]
test = mydata[(mydata.sample .== 2) .& (mydata.units .== i),:]
X = Matrix(train[!, [24; j;[22,25:48...]]])[:,:]
y = train[!, :y_var]
X_test = Matrix(test[!, [24; j;[22,25:48...]]])[:,:]
y_test = test[!, :y_var]
w = zeros(size(X)[2])
y_bar = mean(y)
w_init = log(y_bar/(1-y_bar))
converged = false
nll_sequence = Array{Float64,1}(undef,iterations)
ab = Array{Tuple{Float64, Bool}, 1}(undef,iterations)
for it in 1:iterations   # the IRLS iterations are sequential (each depends on the previous w), so no threading here
h = X*w
p = 1 ./ (1 .+ exp.(-h))
p_adj = copy(p)   # copy so that clamping does not silently mutate p
p_adj[p_adj .== 1.0] .= 0.99999999
nll = -(y'log.(p_adj) + (1 .- y)'log.(1 .- p_adj))
nll_sequence[it] = nll
if it > 1
converged = abs(nll_sequence[it] - nll_sequence[it-1]) < 0.000001
else
converged = false
end
s = p .* (1 .- p)
S = Diagonal(s)
arb_small = ones(length(s)) .* 0.000001
arb_small[s .!= 0] = ((y.-p)./s)[s .!= 0]
z = h + arb_small
w = ((inv(X'*S*X)*X')*S)*z
pred = exp.(X_test*w) ./ (1 .+ exp.(X_test*w))
rmse = sqrt(mean((y_test - pred) .* (y_test - pred)))
ab[it] = (rmse, converged)
end
ab_fin = ab[findfirst(getfield.(ab, 2)) - 1][1]   # RMSE from the iteration just before convergence
catch
ab_fin = NaN
end
lock(lk) do
push!(out, (i, join(cols[j], ","), ab_fin))
end
end
end
return out
end
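For reference, the inner loop of log_reod implements the standard IRLS (Newton) update for logistic regression; in matrix form the step the code mirrors is

$$w \leftarrow (X^\top S X)^{-1} X^\top S z, \qquad z = Xw + S^{-1}(y - p), \qquad S = \operatorname{diag}\big(p_i(1 - p_i)\big).$$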
This seems to work. However, memory allocation is even higher and it is slower than the implementation using the high-level GLM API. Why is this the case?
@time log_reod(unit_s, combins[1:10], names(reg_dat), reg_dat, 15)
171.060764 seconds (5.88 M allocations: 91.736 GiB, 12.45% gc time)
170-element Array{Tuple{Int64,String,Float64},1}:
(4, "ind", 0.29000435564005556)
(15, "ind", 0.28887795028663027)
(8, "ind", 0.286764063267709)
(98, "ind", 0.28941319680023303)
(76, "ind", 0.2898017575615879)
(3, "ind", 0.2910086150956134)
(45, "ind", 0.288856696122843)
(9, "ind", 0.287618950665158)
(1, "ind", 0.2884813713274216)
(2, "ind", 0.28729283992016885)
(7, "ind", 0.28700362633959264)
(6, "ind", 0.2873866890697135)
(10, "ind", 0.28889513223197577)
⋮
(3, "ind_x", 0.29102070956343595)
(45, "ind_x", 0.28884184457957934)
(9, "ind_x", 0.2876167225404162)
(1, "ind_x", 0.2884805471334929)
(2, "ind_x", 0.2873338122136366)
(7, "ind_x", 0.2870078816051725)
(6, "ind_x", 0.28739698518229273)
(10, "ind_x", 0.28891515579773225)
(23, "ind_x", 0.2898286130508298)
(78, "ind_x", 0.2890331733804241)
(54, "ind_x", 0.28855221319296853)
(87, "ind_x", 0.2875034656519413)
Actually, I have misdirected you - sorry for this. GLM.jl does not support views. Still, you can reduce the time if you use matrices directly, though unfortunately not by much.
Data preparation
df = DataFrame(rand(10^6, 10), :auto)   # :auto names the columns x1..x10
df.y = sum.(eachrow(df)) .+ 10 .* randn(10^6) .< 5
function some_models(df)
[glm(@formula(y~x1+x2+x3+x4+x5+x6+x7+x8+x9+x10), df, Binomial(), LogitLink()),
glm(@formula(y~x1+x2+x3+x4+x5+x6+x7+x8+x9), df, Binomial(), LogitLink()),
glm(@formula(y~x1+x2+x3+x4+x5+x6+x7+x8+x10), df, Binomial(), LogitLink()),
glm(@formula(y~x1+x2+x3+x4+x5+x6+x7+x9+x10), df, Binomial(), LogitLink()),
glm(@formula(y~x1+x2+x3+x4+x5+x6+x8+x9+x10), df, Binomial(), LogitLink()),
glm(@formula(y~x1+x2+x3+x4+x5+x7+x8+x9+x10), df, Binomial(), LogitLink()),
glm(@formula(y~x1+x2+x3+x4+x6+x7+x8+x9+x10), df, Binomial(), LogitLink()),
glm(@formula(y~x1+x2+x3+x5+x6+x7+x8+x9+x10), df, Binomial(), LogitLink()),
glm(@formula(y~x1+x2+x4+x5+x6+x7+x8+x9+x10), df, Binomial(), LogitLink()),
glm(@formula(y~x1+x3+x4+x5+x6+x7+x8+x9+x10), df, Binomial(), LogitLink()),
glm(@formula(y~x2+x3+x4+x5+x6+x7+x8+x9+x10), df, Binomial(), LogitLink())]
end
function some_models2(df)
X = [ones(nrow(df)) Matrix(df[!, 1:end-1])]
y = df.y
[fit(GeneralizedLinearModel, X, y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(11)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(10)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(9)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(8)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(7)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(6)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(5)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(4)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(3)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(2)], y, Binomial(), LogitLink()),
fit(GeneralizedLinearModel, X[:, Not(1)], y, Binomial(), LogitLink())
]
end
Benchmarking:
julia> some_models(df); # first run
julia> @time some_models(df); # second run
8.955404 seconds (10.72 k allocations: 4.713 GiB, 5.36% gc time)
julia> some_models2(df); # first run
julia> @time some_models2(df); # second run
8.598572 seconds (1.20 k allocations: 3.412 GiB, 8.65% gc time)
Related
I am replicating in Julia a sequence of steps originally written in Matlab. In Octave, this procedure takes 1.4582 seconds; in Julia (using Jupyter) it takes approximately 10 seconds. I'll try to be brief in the scripts. My goal is to match or improve on Octave's performance. First of all, I will describe my variables and some functions:
zgrid (double 1x7 size)
kgrid (double 500x1 size)
V0 (double 500x7 size)
P (double 7x7 size) a transition matrix
delta and beta are fixed parameters.
F(z,k) and u(c) are particular functions and are specified in the Julia script.
% Octave script
% V0 is given
[K, Z, K2] = meshgrid(kgrid, zgrid, kgrid);
K = permute(K, [2, 1, 3]);
Z = permute(Z, [2, 1, 3]);
K2 = permute(K2, [2, 1, 3]);
C = max(f(Z,K) + (1-delta)*K - K2,0);
U = u(C);
EV = V0*P';% EV is a 500x7 matrix size
EV = permute(repmat(EV, 1, 1, 500), [3, 2, 1]);
H = U + beta*EV;
[TV, index] = max(H, [], 3);
In Julia, I created a function that replicates this procedure. I used loops, but it runs about 9 times slower.
# Julia script
# parameters
α = 1/3
β = 0.987
δ = 0.012;
μ = 2
Kss = 48.1905148382166
kgrid = range(0.75*Kss, stop=1.25*Kss, length=500);
zgrid = [-0.06725382459813659, -0.044835883065424395, -0.0224179415327122, 0, 0.022417941532712187, 0.04483588306542438, 0.06725382459813657]
# V0 is the input of my T operator function
V0 = repeat(sqrt.(kgrid), outer = [1,7]);
F = (z,k) -> exp(z)*(k^α);
u = (c) -> (c^(1-μ) - 1)/(1-μ)
function T(V)
E=V*P'
T1 = zeros(Float64, 500, 7 )
aux = zeros(Float64, 500)
for i = 1:7
for j = 1:500
for l = 1:500
c = max(F(zgrid[i],kgrid[j]) + (1-δ)*kgrid[j] - kgrid[l], 0)
aux[l] = u(c) + β*E[l,i]
end
T1[j,i] = maximum(aux)
end
end
return T1
end
I would very much like to improve my performance in Julia. I believe there is a way to improve, but I am new in Julia programming.
This code runs for me in 5ms. Note that I have made F and u into proper (not anonymous) functions, F_ and u_, but you could get a similar effect by making the anonymous functions const.
Your main problem is that you have a lot of non-const global variables, and also that your main function is doing unnecessary work multiple times, and creating an unnecessary array, aux.
The performance tips section in the manual is essential reading: https://docs.julialang.org/en/v1/manual/performance-tips/
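For example, a minimal sketch (in a fresh session, since F and u are bound non-const above): const-binding the anonymous functions gives the compiler the same information as a named function does:
const F = (z, k) -> exp(z) * k^(1/3)       # const binding: call sites can be type-inferred
const u = c -> (c^(1 - 2) - 1) / (1 - 2)   # μ = 2 hardcoded, as in u_ below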
F_(z,k) = exp(z) * (k^(1/3)); # you can still use α, but it must be const
u_(c) = (c^(1-2) - 1)/(1-2)
function T_(V, P, kgrid, zgrid, β, δ)
E = V * P'
T1 = similar(V)
for i in axes(T1, 2)
for j in axes(T1, 1)
temp = F_(zgrid[i], kgrid[j]) + (1-δ)*kgrid[j]
aux = -Inf
for l in eachindex(kgrid)
c = max(0.0, temp - kgrid[l])
aux = max(aux, u_(c) + β * E[l, i])
end
T1[j,i] = aux
end
end
return T1
end
Benchmark:
using BenchmarkTools
kgrid = sort!(rand(500, 1); dims=1)
zgrid = sort!(rand(1, 7); dims=2)
P = rand(length(zgrid), length(zgrid))
V0 = repeat(sqrt.(kgrid), outer = [1,7]);
@btime T_($V0, $P, $kgrid, $zgrid, $β, $δ);
# output: 5.126 ms (4 allocations: 54.91 KiB)
The following should perform much better. The most noticeable differences are that it calculates F 500x less, and doesn't rely on global variables.
function T(V,P,kgrid,zgrid,β,δ)
E=V*P'
T1 = zeros(Float64, 500, 7)
for j = 1:500
for i = 1:7
x = F(zgrid[i],kgrid[j]) + (1-δ)*kgrid[j]
T1[j,i] = maximum(u(max(x - kgrid[l], 0)) + β*E[l,i] for l in 1:500)
end
end
return T1
end
Is there a way of doing the following without looping?
S, N, H = 9, 7, 4
a = torch.randn(S, N, H)
# tensor with integer values between 1, S of shape (N,)
lens = torch.randint(1, S + 1, (N,))  # tensor with integer values between 1 and S, of shape (N,)
res = torch.zeros(N, H)
for i in range(N):
    res[i] = a[lens[i] - 1, i, :]
Yes, I believe this works.
import torch
S, N, H = 9, 7, 4
a = torch.randn(S, N, H)
# tensor with integer values between 0 and S-1, of shape (N,)
lens = torch.randint(0, S, (N,))
i = torch.arange(N)   # row indices 0..N-1
res = a[lens, i, :]
print(res)
And why did you make lens range from 1 to S and then do lens[i] - 1? I changed it so lens ranges from 0 to S-1 for convenience. However, if you need lens to range from 1 to S, you can change
res = a[lens, i, :]
to
res = a[lens-1, i, :]
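To convince yourself the two are equivalent, here is a quick check (a sketch):
import torch

S, N, H = 9, 7, 4
a = torch.randn(S, N, H)
lens = torch.randint(1, S + 1, (N,))

# original loop version
res_loop = torch.zeros(N, H)
for i in range(N):
    res_loop[i] = a[lens[i] - 1, i, :]

# vectorized version: advanced indexing pairs lens-1 with the row indices 0..N-1
res_vec = a[lens - 1, torch.arange(N), :]

assert torch.equal(res_loop, res_vec)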
I was wondering how to adapt the following code from the github project batchnorm_five_layers to read in two classes (cats & dogs) from local image paths, with image size 780x780 and RGB channels. Here is the uncommented code from the link:
# encoding: UTF-8
import tensorflow as tf
import tensorflowvisu
import math
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
tf.set_random_seed(0)
# Download images and labels into mnist.test (10K images+labels) and mnist.train (60K images+labels)
mnist = read_data_sets("data", one_hot=True, reshape=False, validation_size=0)
# input X: 28x28 grayscale images, the first dimension (None) will index the images in the mini-batch
X = tf.placeholder(tf.float32, [None, 28, 28, 1])
# correct answers will go here
Y_ = tf.placeholder(tf.float32, [None, 10])
# variable learning rate
lr = tf.placeholder(tf.float32)
# train/test selector for batch normalisation
tst = tf.placeholder(tf.bool)
# training iteration
iter = tf.placeholder(tf.int32)
# five layers and their number of neurons (the last layer has 10 softmax neurons)
L = 200
M = 100
N = 60
P = 30
Q = 10
# Weights initialised with small random values between -0.2 and +0.2
# When using RELUs, make sure biases are initialised with small *positive* values, for example 0.1, as in tf.ones([K])/10
W1 = tf.Variable(tf.truncated_normal([784, L], stddev=0.1)) # 784 = 28 * 28
B1 = tf.Variable(tf.ones([L])/10)
W2 = tf.Variable(tf.truncated_normal([L, M], stddev=0.1))
B2 = tf.Variable(tf.ones([M])/10)
W3 = tf.Variable(tf.truncated_normal([M, N], stddev=0.1))
B3 = tf.Variable(tf.ones([N])/10)
W4 = tf.Variable(tf.truncated_normal([N, P], stddev=0.1))
B4 = tf.Variable(tf.ones([P])/10)
W5 = tf.Variable(tf.truncated_normal([P, Q], stddev=0.1))
B5 = tf.Variable(tf.ones([Q])/10)
def batchnorm(Ylogits, is_test, iteration, offset, convolutional=False):
    exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, iteration) # adding the iteration prevents from averaging across non-existing iterations
    bnepsilon = 1e-5
    if convolutional:
        mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
    else:
        mean, variance = tf.nn.moments(Ylogits, [0])
    update_moving_averages = exp_moving_avg.apply([mean, variance])
    m = tf.cond(is_test, lambda: exp_moving_avg.average(mean), lambda: mean)
    v = tf.cond(is_test, lambda: exp_moving_avg.average(variance), lambda: variance)
    Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
    return Ybn, update_moving_averages

def no_batchnorm(Ylogits, is_test, iteration, offset, convolutional=False):
    return Ylogits, tf.no_op()
# The model
XX = tf.reshape(X, [-1, 784])
# batch norm scaling is not useful with relus
# batch norm offsets are used instead of biases
Y1l = tf.matmul(XX, W1)
Y1bn, update_ema1 = batchnorm(Y1l, tst, iter, B1)
Y1 = tf.nn.relu(Y1bn)
Y2l = tf.matmul(Y1, W2)
Y2bn, update_ema2 = batchnorm(Y2l, tst, iter, B2)
Y2 = tf.nn.relu(Y2bn)
Y3l = tf.matmul(Y2, W3)
Y3bn, update_ema3 = batchnorm(Y3l, tst, iter, B3)
Y3 = tf.nn.relu(Y3bn)
Y4l = tf.matmul(Y3, W4)
Y4bn, update_ema4 = batchnorm(Y4l, tst, iter, B4)
Y4 = tf.nn.relu(Y4bn)
Ylogits = tf.matmul(Y4, W5) + B5
Y = tf.nn.softmax(Ylogits)
update_ema = tf.group(update_ema1, update_ema2, update_ema3, update_ema4)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Y_)
cross_entropy = tf.reduce_mean(cross_entropy)*100
# accuracy of the trained model, between 0 (worst) and 1 (best)
correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# matplotlib visualisation
allweights = tf.concat([tf.reshape(W1, [-1]), tf.reshape(W2, [-1]), tf.reshape(W3, [-1])], 0)
allbiases = tf.concat([tf.reshape(B1, [-1]), tf.reshape(B2, [-1]), tf.reshape(B3, [-1])], 0)
# to use for sigmoid
#allactivations = tf.concat([tf.reshape(Y1, [-1]), tf.reshape(Y2, [-1]), tf.reshape(Y3, [-1]), tf.reshape(Y4, [-1])], 0)
# to use for RELU
allactivations = tf.concat([tf.reduce_max(Y1, [0]), tf.reduce_max(Y2, [0]), tf.reduce_max(Y3, [0]), tf.reduce_max(Y4, [0])], 0)
alllogits = tf.concat([tf.reshape(Y1l, [-1]), tf.reshape(Y2l, [-1]), tf.reshape(Y3l, [-1]), tf.reshape(Y4l, [-1])], 0)
I = tensorflowvisu.tf_format_mnist_images(X, Y, Y_)
It = tensorflowvisu.tf_format_mnist_images(X, Y, Y_, 1000, lines=25)
datavis = tensorflowvisu.MnistDataVis(title4="Logits", title5="Max activations across batch", histogram4colornum=2, histogram5colornum=2)
# training step, the learning rate is a placeholder
train_step = tf.train.AdamOptimizer(lr).minimize(cross_entropy)
# init
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
# You can call this function in a loop to train the model, 100 images at a time
def training_step(i, update_test_data, update_train_data):
    # training on batches of 100 images with 100 labels
    batch_X, batch_Y = mnist.train.next_batch(100)
    max_learning_rate = 0.03
    min_learning_rate = 0.0001
    decay_speed = 1000.0
    learning_rate = min_learning_rate + (max_learning_rate - min_learning_rate) * math.exp(-i/decay_speed)
    # compute training values for visualisation
    if update_train_data:
        a, c, im, al, ac = sess.run([accuracy, cross_entropy, I, alllogits, allactivations], {X: batch_X, Y_: batch_Y, tst: False})
        print(str(i) + ": accuracy:" + str(a) + " loss: " + str(c) + " (lr:" + str(learning_rate) + ")")
        datavis.append_training_curves_data(i, a, c)
        datavis.update_image1(im)
        datavis.append_data_histograms(i, al, ac)
    # compute test values for visualisation
    if update_test_data:
        a, c, im = sess.run([accuracy, cross_entropy, It], {X: mnist.test.images, Y_: mnist.test.labels, tst: True})
        print(str(i) + ": ********* epoch " + str(i*100//mnist.train.images.shape[0]+1) + " ********* test accuracy:" + str(a) + " test loss: " + str(c))
        datavis.append_test_curves_data(i, a, c)
        datavis.update_image2(im)
    # the backpropagation training step
    sess.run(train_step, {X: batch_X, Y_: batch_Y, lr: learning_rate, tst: False})
    sess.run(update_ema, {X: batch_X, Y_: batch_Y, tst: False, iter: i})
datavis.animate(training_step, iterations=10000+1, train_data_update_freq=20, test_data_update_freq=100, more_tests_at_start=True)
print("max test accuracy: " + str(datavis.get_max_test_accuracy()))
To answer your question in the comments: this is probably what you want to change your code into:
# input X: images, the first dimension (None) will index the images in the mini-batch
X = tf.placeholder(tf.float32, [None, 780, 780, 3])
# correct answers will go here
Y_ = tf.placeholder(tf.float32, [None, 2])
And an image can be read like this:
from scipy import misc
input = misc.imread('input.png')
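Building on that, here is a minimal sketch (the file name and label are hypothetical) of turning one local 780x780 RGB image into a feed for the placeholders above:
import numpy as np
from scipy import misc

img = misc.imread('cats/cat_0001.png')                 # expected shape (780, 780, 3)
batch_X = img[np.newaxis].astype(np.float32) / 255.0   # shape (1, 780, 780, 3)
batch_Y = np.array([[1.0, 0.0]], dtype=np.float32)     # one-hot label, e.g. cat = [1, 0]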
Now it might be best to follow a Tensorflow tutorial. This one is really good: kadenze.com/courses/creative-applications-of-deep-learning-with-tensorflow-iv/info
Good luck!
What would be a more performant way to process this 2D array without third-party libraries?
#time
let ar = array2D[[5.0; 6.0; 7.0; 8.0]; [1.0; 2.0; 3.0; 4.0]]
[0..5000000]
let a2 = ar |> Array2D.mapi(fun rowi coli value -> (value + 1.6) * double(coli + 6) * double(rowi + 7))
If you run the above code, it takes about 0ms, so it really depends on the context in which you are calling it. If you just run it in a loop 1M times, then it takes about 600ms on my machine:
for i in 0 .. 1000000 do
    let a2 = ar |> Array2D.mapi(fun rowi coli value ->
        (value + 1.6) * double ((coli + 6) * (rowi + 7)))
    ()
Here, most of the time is spent allocating the result array - for each iteration, we need to allocate a new 2D array to store the result. This gives you nice functional properties (the results can be shared because they're not mutated) but it is why it takes longer.
You can use some mutation and avoid this. Whether that is acceptable depends on the context, which is why you probably won't get a single definitive answer here.
For example, in this artificial 1M loop example, I could just allocate one array to store the results and then write there repeatedly:
let res = ar |> Array2D.map id
for i in 0 .. 1000000 do
    for x in 0 .. ar.GetLength(0) - 1 do
        for y in 0 .. ar.GetLength(1) - 1 do
            res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
This takes about 100ms, so that gives you an idea about the cost of the allocation. But then, you should not do this change if it can break your program because now you'd be using mutable arrays...
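If you do go down the mutation route, one way to keep it contained (a sketch) is to make the destination buffer an explicit parameter, so the caller decides when it is safe to reuse it:
// writes the transformed values of src into the caller-supplied dst buffer
let mapInto (dst: float[,]) (src: float[,]) =
    for x in 0 .. src.GetLength(0) - 1 do
        for y in 0 .. src.GetLength(1) - 1 do
            dst.[x, y] <- (src.[x, y] + 1.6) * double ((x + 6) * (y + 7))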
I did some measurements of this problem which I thought could be interesting.
I created 8 different test cases and ran them over 3 differently sized matrices: 1000x1000, 100x100 and 10x10.
In addition I ran the tests in x64 as well as x86.
I ended up with 48 test results, presented in two graphs. The y-axis is the execution time in milliseconds.
Creating Zero Matrix - the cost of creating a zero matrix
Copying Matrix - the cost of copying a matrix with Array2D.copy
Mapping Matrix with id - the cost of copying a matrix with Array2D.map id
Original Algorithm - the cost of the algorithm posted by OP
Tomas Petricek Algorithm - the cost of the algorithm by Tomas
Modified Tomas Petricek Algorithm - the cost of the modified algorithm to use Array.zeroCreate
Reverse Algorithm - the cost of iterating over the matrix in reverse
Flipped x,y Algorithm - the cost of the modified algorithm but flipping x,y iteration order
Some observations
Tomas wanted to demonstrate the cost of the copy compared to the computation, so in his example the copy was not part of the inner loop. I wanted to include his code, so I moved the copy into the inner loop to be able to compare with the others. The modified Tomas algorithm is the same code but uses Array2D.zeroCreate to create a fresh matrix. When writing this I realize it would have been better to call both of them modified.
On .NET 4.5.2, x64 is doing significantly better in general
There are performance benefits to using Array2D.zeroCreate and populating the matrix over using Array2D.copy
For large matrices the x,y iteration order is extremely important. For small matrices it doesn't matter. This is because of how CPU caches work
Iterating in reverse order over the array seems to give a small benefit. The reason is that it's cheaper to check y >= 0 than y < yl.
The reverse algorithm has to use tail-recursion because F#'s for y = (yl - 1) downto 0 uses y > variable_that_is_always_minus_1 to check for loop end. With tail-recursion we can force y >= 0
For smaller matrices the cost of creating them and the cost of the GC becomes relatively larger.
The code used to generate the measurements.
open System
open System.IO
open System.Diagnostics

let clock =
    let sw = Stopwatch ()
    sw.Start ()
    sw

let collectionCount () =
    GC.CollectionCount 0 + GC.CollectionCount 1 + GC.CollectionCount 2

let timeIt (n : string) (outer : int) (a : unit -> 'T) : 'T*int64 =
    printfn "Timing '%s'..." n
    let v = a ()
    let t = clock.ElapsedMilliseconds
    for i in 1..outer do
        a () |> ignore
    let e = clock.ElapsedMilliseconds - t
    printfn " took %d ms" e
    v, e

[<EntryPoint>]
let main argv =
    let random = Random 19740531
    let total = 100000000
    let outers = [|100;10000;1000000|]
    use output = new StreamWriter @".\output.tsv"
    "Dimensions\tName\tSum\tCollectionCounts\tMilliseconds" |> output.WriteLine
    for outer in outers do
        let inner = total / outer
        let dim = inner |> float |> sqrt |> int32
        let ar = Array2D.init dim dim (fun _ _ -> random.NextDouble ())
        printfn "New test run, matrix dimensions are %dx%d" dim dim
        let run = sprintf "%d_%d" dim dim
        let perf_zero () : float[,] =
            let xl = ar.GetLength(0)
            let yl = ar.GetLength(1)
            let res = Array2D.zeroCreate xl yl
            res
        let perf_copy () : float[,] =
            Array2D.copy ar
        let perf_id () : float[,] =
            ar |> Array2D.map id
        let perf_op () : float[,] =
            ar |> Array2D.mapi(fun rowi coli value -> (value + 1.6) * double(coli + 6) * double(rowi + 7))
        let perf_tp () : float[,] =
            let res = ar |> Array2D.map id
            for x in 0 .. ar.GetLength(0) - 1 do
                for y in 0 .. ar.GetLength(1) - 1 do
                    res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
            res
        let perf_tpm () : float[,] =
            let xl = ar.GetLength(0)
            let yl = ar.GetLength(1)
            let res = Array2D.zeroCreate xl yl
            for x in 0 .. xl - 1 do
                for y in 0 .. yl - 1 do
                    res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
            res
        let perf_tpmf () : float[,] =
            let xl = ar.GetLength(0)
            let yl = ar.GetLength(1)
            let res = Array2D.zeroCreate xl yl
            for y in 0 .. yl - 1 do
                for x in 0 .. xl - 1 do
                    res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
            res
        let perf_tr () : float[,] =
            let xl = ar.GetLength(0)
            let yl = ar.GetLength(1)
            let res = Array2D.zeroCreate xl yl
            let rec loopy x y =
                if y >= 0 then
                    res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
                    loopy x (y - 1)
                else
                    ()
            and loopx x =
                if x >= 0 then
                    loopy x (yl - 1)
                    loopx (x - 1)
                else
                    ()
            loopx (xl - 1)
            res
        let testCases =
            [|
                "Creating Zero Matrix"              , perf_zero
                "Copying Matrix"                    , perf_copy
                "Mapping Matrix with id"            , perf_id
                "Original Algorithm"                , perf_op
                "Tomas Petricek Algorithm"          , perf_tp
                "Modified Tomas Petricek Algorithm" , perf_tpm
                "Reverse Algorithm"                 , perf_tr
                "Flipped x,y Algorithm"             , perf_tpmf
            |]
        for name, a in testCases do
            let pcc = collectionCount ()
            let vs, t = timeIt name outer a
            let sum = ref 0.
            vs |> Array2D.iter (fun v -> sum := !sum + v)
            let dcc = collectionCount () - pcc
            sprintf "%s\t%s\t%f\t%d\t%d" run name !sum dcc t |> output.WriteLine
    0
As OP specified that his problem dealt with smaller matrices like 9x4, I did another set of measurements. However, since I thought my previous answer held some interesting points about metrics at larger sizes, I decided to create a new answer.
I did some measurements of this problem which I thought could be interesting.
I created 9 different test cases and ran them over a 10x5 matrix. All tests were run in Release (obviously)/x64.
The first graph shows the execution time in milliseconds:
The second graph shows the number of GC collections during test run:
Creating Zero Matrix - the cost of creating a zero matrix
Copying Matrix - the cost of copying a matrix with Array2D.copy
Mapping Matrix with id - the cost of copying a matrix with Array2D.map id
Original Algorithm - the cost of the algorithm posted by OP
Tomas P Algorithm with Zero Init - the cost of the algorithm by Tomas with Array2D.zeroCreate
Creating Zero Fixed Size Matrix - the cost of creating a zero fixed size matrix
Copying Fixed Size Matrix - the cost of copying a fixed size matrix
Fixed Size Algorithm - the cost of OP's algorithm adapted to a fixed size matrix
Fixed Size Updater - the cost of OP's algorithm using an updater function
The Fixed Size Matrix is a struct that uses unsafe code to avoid GC allocations. It's written in C# but might be portable to F#. It should not be seen as production-worthy code, more like inspiration for something of your own creation.
Some observations:
Copying a Fixed Size matrix is quick
The Fixed Size Algorithm doesn't perform as well as one would hope, potentially because the JIT has to do some extra lifting because of the unsafe code
The Fixed Size Updater (which is similar to Array2D.iteri) has the best performance
As expected, Fixed Size Matrices don't create any GC pressure, as they don't rely on GC allocation
It's hard for me to judge whether the Fixed Size Matrix is a viable path for OP, but it's an option that might be worth considering.
F# code:
open System
open System.IO
open System.Diagnostics
open Unsafe

let clock =
    let sw = Stopwatch ()
    sw.Start ()
    sw

let collectionCount () =
    GC.CollectionCount 0 + GC.CollectionCount 1 + GC.CollectionCount 2

let createTimer (n : string) (a : unit -> 'T) (r : 'T -> 'TResult) : string*(int -> 'TResult*int64*int) =
    n, fun outer ->
        printfn "Timing '%s'..." n
        let v = a () |> r
        GC.Collect ()
        GC.WaitForFullGCComplete () |> ignore
        let pcc = collectionCount ()
        let t = clock.ElapsedMilliseconds
        for i in 1..outer do
            a () |> ignore
        let e = clock.ElapsedMilliseconds - t
        let dcc = collectionCount () - pcc
        printfn " took %d ms, collected %d times, result is %A" e dcc v
        v, e, dcc

[<EntryPoint>]
let main argv =
    let random = Random 19740531
    let total = 300000000
    use output = new StreamWriter @".\output.tsv"
    "Name\tSum\tCollectionCounts\tMilliseconds" |> output.WriteLine
    let cols = 5
    let rows = 10
    let inner = cols*rows
    let outer = total / inner
    let ar = Array2D.init rows cols (fun _ _ -> random.NextDouble ())
    let mtx5x10 =
        let mutable m = Matrix5x10 ()
        ar |> Array2D.iteri (fun row col v -> (m.[col, row] <- v))
        m
    printfn "New test run, matrix dimensions are %dx%d" cols rows
    let perf_zero () =
        let xl = ar.GetLength(0)
        let yl = ar.GetLength(1)
        let res = Array2D.zeroCreate xl yl
        res
    let perf_copy () =
        Array2D.copy ar
    let perf_id () =
        ar |> Array2D.map id
    let perf_op () =
        ar |> Array2D.mapi(fun rowi coli value -> (value + 1.6) * double(rowi + 6) * double(coli + 7))
    let perf_tpm () =
        let xl = ar.GetLength(0)
        let yl = ar.GetLength(1)
        let res = Array2D.zeroCreate xl yl
        for x in 0 .. xl - 1 do
            for y in 0 .. yl - 1 do
                res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
        res
    let perf_fzero () =
        let m = Matrix5x10()
        m
    let perf_fcopy () =
        let m = mtx5x10
        m
    let perf_fs () =
        let mutable m = Matrix5x10 ()
        for row = 0 to Matrix5x10.Rows - 1 do
            for col = 0 to Matrix5x10.Columns - 1 do
                m.[col, row] <- (mtx5x10.[col, row] + 1.6) * double ((row + 6) * (col + 7))
        m
    let perf_fsui = Func<int, int, double, double> (fun col row v -> (v + 1.6) * double ((row + 6) * (col + 7)))
    let perf_fsu () =
        let mutable m = mtx5x10
        m.Update perf_fsui
        m
    let sumArray vs =
        let sum = ref 0.
        vs |> Array2D.iter (fun v -> sum := !sum + v)
        !sum
    let sumMatrix (mtx : Matrix5x10) =
        let sum = ref 0.
        mtx.Update (fun _ _ v -> sum := !sum + v; v)
        !sum
    let testCases =
        [|
            createTimer "Creating Zero Matrix" perf_zero sumArray
            createTimer "Copying Matrix" perf_copy sumArray
            createTimer "Mapping Matrix with id" perf_id sumArray
            createTimer "Original Algorithm" perf_op sumArray
            createTimer "Tomas P Algorithm with Zero Init" perf_tpm sumArray
            createTimer "Creating Zero Fixed Size Matrix" perf_fzero sumMatrix
            createTimer "Copying Fixed Size Matrix" perf_fcopy sumMatrix
            createTimer "Fixed Size Algorithm" perf_fs sumMatrix
            createTimer "Fixed Size Updater" perf_fsu sumMatrix
        |]
    for name, a in testCases do
        let sum, t, dcc = a outer
        sprintf "%s\t%f\t%d\t%d" name sum dcc t |> output.WriteLine
    0
C# code (for those that care I generated this with T4):
namespace Unsafe
{
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
[StructLayout(LayoutKind.Sequential)]
public struct Matrix5x10
{
double m_c0_r0;
double m_c1_r0;
double m_c2_r0;
double m_c3_r0;
double m_c4_r0;
double m_c0_r1;
double m_c1_r1;
double m_c2_r1;
double m_c3_r1;
double m_c4_r1;
double m_c0_r2;
double m_c1_r2;
double m_c2_r2;
double m_c3_r2;
double m_c4_r2;
double m_c0_r3;
double m_c1_r3;
double m_c2_r3;
double m_c3_r3;
double m_c4_r3;
double m_c0_r4;
double m_c1_r4;
double m_c2_r4;
double m_c3_r4;
double m_c4_r4;
double m_c0_r5;
double m_c1_r5;
double m_c2_r5;
double m_c3_r5;
double m_c4_r5;
double m_c0_r6;
double m_c1_r6;
double m_c2_r6;
double m_c3_r6;
double m_c4_r6;
double m_c0_r7;
double m_c1_r7;
double m_c2_r7;
double m_c3_r7;
double m_c4_r7;
double m_c0_r8;
double m_c1_r8;
double m_c2_r8;
double m_c3_r8;
double m_c4_r8;
double m_c0_r9;
double m_c1_r9;
double m_c2_r9;
double m_c3_r9;
double m_c4_r9;
public const int Columns = 5;
public const int Rows = 10;
unsafe public double this[int x, int y]
{
[MethodImpl (MethodImplOptions.AggressiveInlining)]
get
{
var i = 5 * y + x;
if (i < 0 || i >= 50)
{
throw new IndexOutOfRangeException ("0 <= x < 5 && 0 <= y < 10");
}
fixed (double * ms = &m_c0_r0)
{
return ms[i];
}
}
[MethodImpl (MethodImplOptions.AggressiveInlining)]
set
{
var i = 5 * y + x;
if (i < 0 || i >= 50)
{
throw new IndexOutOfRangeException ("0 <= x < 5 && 0 <= y < 10");
}
fixed (double * ms = &m_c0_r0)
{
ms[i] = value;
}
}
}
public void Update (Func<int, int, double, double> updater)
{
if (updater == null)
{
return;
}
m_c0_r0 = updater (0, 0, m_c0_r0);
m_c1_r0 = updater (1, 0, m_c1_r0);
m_c2_r0 = updater (2, 0, m_c2_r0);
m_c3_r0 = updater (3, 0, m_c3_r0);
m_c4_r0 = updater (4, 0, m_c4_r0);
m_c0_r1 = updater (0, 1, m_c0_r1);
m_c1_r1 = updater (1, 1, m_c1_r1);
m_c2_r1 = updater (2, 1, m_c2_r1);
m_c3_r1 = updater (3, 1, m_c3_r1);
m_c4_r1 = updater (4, 1, m_c4_r1);
m_c0_r2 = updater (0, 2, m_c0_r2);
m_c1_r2 = updater (1, 2, m_c1_r2);
m_c2_r2 = updater (2, 2, m_c2_r2);
m_c3_r2 = updater (3, 2, m_c3_r2);
m_c4_r2 = updater (4, 2, m_c4_r2);
m_c0_r3 = updater (0, 3, m_c0_r3);
m_c1_r3 = updater (1, 3, m_c1_r3);
m_c2_r3 = updater (2, 3, m_c2_r3);
m_c3_r3 = updater (3, 3, m_c3_r3);
m_c4_r3 = updater (4, 3, m_c4_r3);
m_c0_r4 = updater (0, 4, m_c0_r4);
m_c1_r4 = updater (1, 4, m_c1_r4);
m_c2_r4 = updater (2, 4, m_c2_r4);
m_c3_r4 = updater (3, 4, m_c3_r4);
m_c4_r4 = updater (4, 4, m_c4_r4);
m_c0_r5 = updater (0, 5, m_c0_r5);
m_c1_r5 = updater (1, 5, m_c1_r5);
m_c2_r5 = updater (2, 5, m_c2_r5);
m_c3_r5 = updater (3, 5, m_c3_r5);
m_c4_r5 = updater (4, 5, m_c4_r5);
m_c0_r6 = updater (0, 6, m_c0_r6);
m_c1_r6 = updater (1, 6, m_c1_r6);
m_c2_r6 = updater (2, 6, m_c2_r6);
m_c3_r6 = updater (3, 6, m_c3_r6);
m_c4_r6 = updater (4, 6, m_c4_r6);
m_c0_r7 = updater (0, 7, m_c0_r7);
m_c1_r7 = updater (1, 7, m_c1_r7);
m_c2_r7 = updater (2, 7, m_c2_r7);
m_c3_r7 = updater (3, 7, m_c3_r7);
m_c4_r7 = updater (4, 7, m_c4_r7);
m_c0_r8 = updater (0, 8, m_c0_r8);
m_c1_r8 = updater (1, 8, m_c1_r8);
m_c2_r8 = updater (2, 8, m_c2_r8);
m_c3_r8 = updater (3, 8, m_c3_r8);
m_c4_r8 = updater (4, 8, m_c4_r8);
m_c0_r9 = updater (0, 9, m_c0_r9);
m_c1_r9 = updater (1, 9, m_c1_r9);
m_c2_r9 = updater (2, 9, m_c2_r9);
m_c3_r9 = updater (3, 9, m_c3_r9);
m_c4_r9 = updater (4, 9, m_c4_r9);
}
}
}
I used
library('cubature')
adaptIntegrate(doubleintegral, lowerLimit = c(-2.5, -2), upperLimit = c(0, 2), x=x,r=r,m=m,n=n)$integral
to integrate the following function:
doubleintegral <- function(y,x,r,n,m){
max(((-y[1])^(n-1)*(y[2]-y[1])^(m-1)*exp((x[7]+x[4])*y[1]))*
exp(-x[4]*y[2] - (r[v]-y[2]-x[5]*x[3]+0.5*x[6]^2*x[3])^2/(2*x[6]^2*x[3])),0)}
I used these example parameters:
x <- numeric()
x[1] = 42
x[2] = 21
x[3] = 1
x[4] = 72.9
x[5] = 0.0332
x[6] = 0.0311
x[7] = 16.8
r <- numeric()
r = 0.0006
v = 1
r[v] = -0.036
w = 1
n = 2
i = 1
m = 2
I don't understand how the following output is possible:
> adaptIntegrate(doubleintegral, lowerLimit = c(-130, -4), upperLimit = c(0, 4), x=x,r=r,m=m,n=n)$integral
[1] 1.12184e-07
> adaptIntegrate(doubleintegral, lowerLimit = c(-2.5, -2), upperLimit = c(0, 2), x=x,r=r,m=m,n=n)$integral
[1] 2.516489e-07
By widening the limits I get a smaller value. I faced the same problem with the function integrate, which I applied to another function. How is this possible?
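For reference, adaptIntegrate also takes tol and maxEval arguments that control the adaptive refinement; here is a sketch of a call with tighter settings (same parameters as above) to check whether the default tolerance is simply missing the integrand's peak:
library(cubature)
adaptIntegrate(doubleintegral, lowerLimit = c(-130, -4), upperLimit = c(0, 4),
               x = x, r = r, m = m, n = n, tol = 1e-8, maxEval = 10^6)$integral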
Does someone have another idea for this double integral?
Thanks for your help!