Numpy version of rolling MAD (mean absolute deviation) - performance

How do I make a rolling version of the following MAD function?
from numpy import mean, absolute

def mad(data, axis=None):
    return mean(absolute(data - mean(data, axis)), axis)
This code is an answer to this question
At the moment I convert the NumPy array to a pandas DataFrame, apply this function, then convert the result back to NumPy:
pandasDataFrame.rolling(window=90).apply(mad)
but this is inefficient on larger data frames. How can I get a rolling window for the same function in NumPy, without looping, that gives the same result?

Here's a vectorized NumPy approach -
import numpy as np

# From this post : http://stackoverflow.com/a/40085052/3293881
def strided_app(a, L, S):  # Window len = L, Stride len/stepsize = S
    nrows = ((a.size - L) // S) + 1
    n = a.strides[0]
    return np.lib.stride_tricks.as_strided(a, shape=(nrows, L), strides=(S * n, n))

# From this post : http://stackoverflow.com/a/14314054/3293881 by @Jaime
def moving_average(a, n=3):
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

def mad_numpy(a, W):
    a2D = strided_app(a, W, 1)
    return np.absolute(a2D - moving_average(a, W)[:, None]).mean(1)
Runtime test -
In [617]: data = np.random.randint(0,9,(10000))
...: df = pd.DataFrame(data)
...:
In [618]: pandas_out = pd.rolling_apply(df,90,mad).values.ravel()
In [619]: numpy_out = mad_numpy(data,90)
In [620]: np.allclose(pandas_out[89:], numpy_out) # Nans part clipped
Out[620]: True
In [621]: %timeit pd.rolling_apply(df,90,mad)
10 loops, best of 3: 111 ms per loop
In [622]: %timeit mad_numpy(data,90)
100 loops, best of 3: 3.4 ms per loop
In [623]: 111/3.4
Out[623]: 32.64705882352941
Huge 32x+ speedup there over the loopy pandas solution!
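As a side note, on NumPy 1.20+ the manual stride arithmetic can be avoided entirely: np.lib.stride_tricks.sliding_window_view builds the same 2D windowed view more safely than as_strided. A minimal sketch of the same MAD computation with it (equivalent output, assuming NumPy 1.20+):
import numpy as np

def mad_numpy_swv(a, W):
    # (len(a)-W+1, W) read-only view of all length-W windows
    a2D = np.lib.stride_tricks.sliding_window_view(a, W)
    # per-window mean absolute deviation about the window mean
    return np.absolute(a2D - a2D.mean(1, keepdims=True)).mean(1)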

Related

Runge-Kutta curve fitting extremely slow

I am currently trying to do a regression of a function calculated via an RK4 method applied to a non-linear Volterra integral equation of the second kind. The problem is that the code is extremely slow: one call of the curve_fit function (fitt) takes about 30-40 minutes to produce a fit, and since there will be many calls to fitt before the parameters are determined, the whole run takes more than 6 hours. Is there any way to optimize this code? Thanks in advance!
from scipy.special import gamma
from ml_internal import LTInversion
from scipy.optimize import curve_fit, fsolve
from scipy.misc import derivative
from sklearn.metrics import r2_score
from math import comb, factorial
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Gets the data
df = pd.read_excel('D:\\CoMat\\Fractional_fit\\optimized\\data_optimized.xlsx')
skipTime = 1
skipIndex = df[df['Time'] == skipTime].index.values[0]
xls = pd.read_excel('D:\\CoMat\\Fractional_fit\\optimized\\data_optimized.xlsx', skiprows=np.arange(1, skipIndex + 1, 1))
timeDF = xls['Time']
tempDF = xls['Temp']
taDF = xls['Ta']

timeDF = timeDF - timeDF[0]
tempDF = tempDF + 273.15
t0 = tempDF[0]
ta = sum(taDF) / len(taDF)
ta = ta + 273.15

###########################################
# Splitting into intervals
h = 0.05
a = 0
b = timeDF[len(timeDF) - 1]
N = int(np.round((b - a) / h))

# Each xi
def xidx(index):
    return a + h * index

# Functions in the image are written here.
def gx(t, lamda, alpha):
    return t0 * ml(lamda * (t**alpha), alpha)
gx = np.vectorize(gx)

def kernel(t, s, rad, lamda, alpha, beta):
    if t == s:
        return 0
    return (t - s)**(alpha - 1) * ml_(lamda * ((t - s)**alpha), alpha, alpha, 1) * (beta * (rad**4) - beta * (ta**4) - lamda * ta)
kernel = np.vectorize(kernel)

############################
# The problem is here!!!!!!
def fx(x, n, lamda, alpha, beta):
    ans = gx(x, lamda, alpha)
    for j in range(n):
        ans += (h / 6) * (kernel(x, xidx(j), f0[j], lamda, alpha, beta)
                          + 2 * kernel(x, xidx(j + 1/2), f1[j], lamda, alpha, beta)
                          + 2 * kernel(x, xidx(j + 1/2), f2[j], lamda, alpha, beta)
                          + kernel(x, xidx(j + 1), f3[j], lamda, alpha, beta))
    return ans

#########################
f0 = np.zeros(N + 1)
f0[0] = t0
f1 = np.zeros(N + 1)
f2 = np.zeros(N + 1)
f3 = np.zeros(N + 1)
F = np.zeros((3, N + 1))

def fitt(xvalue, lamda, alpha, beta):
    global f0, f1, f2, f3, F
    n = int(np.round(xvalue / h))
    f1[n] = fx(xidx(n) + 1/2, n, lamda, alpha, beta) + (h/2) * kernel(xidx(n + 1/2), xidx(n), f0[n], lamda, alpha, beta)
    f2[n] = fx(xidx(n + 1/2), n, lamda, alpha, beta)
    f3[n] = fx(xidx(n + 1), n, lamda, alpha, beta) + h * kernel(xidx(n + 1), xidx(n + 1/2), f2[n], lamda, alpha, beta)
    if n + 1 <= N:
        f0[n + 1] = fx(xidx(n + 1), n, lamda, alpha, beta) + (h/6) * (kernel(xidx(n + 1), xidx(n), f0[n], lamda, alpha, beta)
                                                                      + 2 * kernel(xidx(n + 1), xidx(n + 1/2), f1[n], lamda, alpha, beta)
                                                                      + 2 * kernel(xidx(n + 1), xidx(n + 1/2), f2[n], lamda, alpha, beta)
                                                                      + kernel(xidx(n + 1), xidx(n + 1), f3[n], lamda, alpha, beta))
    if xvalue == timeDF[len(timeDF) - 1]:
        print(f0[n], n)
        returnValue = f0[n]
        f0 = np.zeros(N + 1)
        f0[0] = t0
        f1 = np.zeros(N + 1)
        f2 = np.zeros(N + 1)
        f3 = np.zeros(N + 1)
        return returnValue
    print(f0[n], n)
    return f0[n]
fitt = np.vectorize(fitt)

# Fitting, plotting and giving (Adj) R-squared
popt, pcov = curve_fit(fitt, timeDF, tempDF, p0=(-0.1317, 0.95, -1e-11), bounds=((-np.inf, 0, -np.inf), (0, 1, 0)))
print(popt)
y_fit = np.array(fitt(timeDF, popt[0], popt[1], popt[2]))
plt.scatter(timeDF, tempDF, color='ORANGE', marker='.', s=0.5)
plt.fill_between(timeDF, tempDF - 0.5, tempDF + 0.5, color='ORANGE', alpha=0.2)
plt.plot(timeDF, y_fit, color='RED', linewidth=1)
plt.legend(["Experimental data", "Caputo fit"], loc="upper right")
plt.xlabel("Time (min)")
plt.ylabel("Temperature (Kelvin)")
plt.show()
plt.close()
r2 = r2_score(tempDF, y_fit)
print(r2)
adjr2 = 1 - (1 - r2) * ((len(xls) - 1) / (len(xls) - 3 - 1))
print(adjr2)
I already tried computing the values f0, f1, f2, f3 all at once, but the thing consuming the most time is Fn(x), which I haven't figured out how to compute all at once. If that were possible, I think the program would run much faster. PS: ml and ml_ are functions from https://github.com/khinsen/mittag-leffler.
These are the necessary functions; Fn is the only one I haven't figured out yet.
There are two typing errors in the cited image. The combination of x_n and 1/2 is always meant to be the midpoint x_{n+1/2} = x_n + h/2. The second error is a duplication of x_{n+1/2} in the formula for f^{(4)}_n, in its third term. The first error is probably producing errors large enough to make convergence difficult and any limit wrong for the intended problem.
In the Simpson/RK4 step, the 4 fx computations can be reduced to 2.
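Concretely, in the fitt function above, that reuse could look like this (a sketch, assuming the midpoint typo is fixed first):
    # Sketch: with the midpoint typo fixed, f1[n] and f2[n] share one fx
    # evaluation and f3[n] and f0[n+1] share another, so 4 calls become 2.
    fx_mid = fx(xidx(n + 1/2), n, lamda, alpha, beta)
    fx_right = fx(xidx(n + 1), n, lamda, alpha, beta)
    f1[n] = fx_mid + (h/2) * kernel(xidx(n + 1/2), xidx(n), f0[n], lamda, alpha, beta)
    f2[n] = fx_mid
    f3[n] = fx_right + h * kernel(xidx(n + 1), xidx(n + 1/2), f2[n], lamda, alpha, beta)
    # ... and fx_right is reused again in the f0[n+1] update.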
The F_n implement the left-hand side of the integral equation
F(x) = g(x) + \int_0^x K(x, s, f(s)) \, ds
where the integral is approximated with the sample sequences f0, ..., f3. Due to the structure of the problem and algorithm, F_n(x_n) = f^0_n = f^4_{n-1}.
Note that K(x,s,f) should be set to zero for s >= x. In the exact version of the equation these values "above the diagonal" are not used.
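In the code above that amounts to strengthening the guard in kernel (a sketch; the returned expression is unchanged):
def kernel(t, s, rad, lamda, alpha, beta):
    if s >= t:  # zero "above the diagonal" (the original only checked t == s)
        return 0
    return (t - s)**(alpha - 1) * ml_(lamda * ((t - s)**alpha), alpha, alpha, 1) \
        * (beta * (rad**4) - beta * (ta**4) - lamda * ta)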
If an increase in accuracy is needed, for instance to avoid divergence where there is none in the exact solution, you can decrease the step size by a factor of 10 and then sub-sample the f^0_n sequence to produce the numerical guess for the given data. Other factors than 10 are of course also possible.
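A minimal sketch of that refinement idea, assuming a hypothetical solve(h) helper that runs the scheme above with step size h and returns the f^0 sample sequence:
# solve(h) is a hypothetical wrapper around the scheme above.
refine = 10                    # refinement factor (other factors work too)
f0_fine = solve(h / refine)    # run on the finer grid
f0_guess = f0_fine[::refine]   # sub-sample back onto the original grid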

How to vectorize this function

The following code works, but I would like to create Z by vectorization. How to achieve that?
import numpy as np
from numpy import sqrt
from math import fsum

points = np.array([[0, 0],
                   [5, -1],
                   [4, 6],
                   [1, 3]])

d = lambda x: fsum([sqrt((x[0] - z[0])**2 + (x[1] - z[1])**2) for z in points])

x = np.linspace(min(points[:, 0]), max(points[:, 0]), 100)
y = np.linspace(min(points[:, 1]), max(points[:, 1]), 100)
X, Y = np.meshgrid(x, y)
Z = np.zeros(np.shape(X))
for (i, j), _ in np.ndenumerate(Z):
    Z[i, j] = d([X[i, j], Y[i, j]])
#Z = d([X, Y])  # this fails
We can leverage broadcasting to work directly with the 1D versions and thus be more memory efficient and give ourselves a vectorized one-liner, like so -
Z = np.sqrt((x[:,None] - points[:,0])**2 + (y[:,None,None] - points[:,1])**2).sum(2)
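Here x[:,None] - points[:,0] broadcasts to shape (100, 4) and y[:,None,None] - points[:,1] to (100, 1, 4), so their sum has shape (100, 100, 4) and .sum(2) adds up the four per-point distances, matching Z[i,j] = d([x[j], y[i]]). A quick sanity check against the loopy version:
Z_vec = np.sqrt((x[:, None] - points[:, 0])**2 +
                (y[:, None, None] - points[:, 1])**2).sum(2)
assert np.allclose(Z, Z_vec)  # Z from the loopy version above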
Timings on posted sample data -
In [80]: %%timeit
    ...: X, Y = np.meshgrid(x,y)
    ...: Z = np.zeros(np.shape(X))
    ...: for (i,j),_ in np.ndenumerate(Z):
    ...:     Z[i,j] = d([X[i,j],Y[i,j]])
10 loops, best of 3: 101 ms per loop
In [81]: %timeit ((x[:,None] - points[:,0])**2 + (y[:,None,None] - points[:,1])**2).sum(2)
1000 loops, best of 3: 246 µs per loop
400x speedup there!

Kernel Restarting: The kernel appears to have died & dst tensor is not initialized

I'm using TensorFlow (GPU) to fit a CNN model (the total input data size is only 9.8 MB in np array form), and I'm on Windows 10 (Kaby Lake), TensorFlow GPU mode, GeForce GTX 1050, 32 GB RAM.
Each time I try running the piece of code below, it either kills the kernel or throws the error "dst tensor is not initialized". This code seems to be executable by others with relatively lower computing power than mine, but I'm not sure how to get it to work.
I am able to run the code below in TensorFlow CPU mode without any problem, but it takes almost 12 hours to finish, especially when the number of epochs is set to more than just 3. That's why I need to run it on my GPU for faster execution.
import tensorflow as tf
import numpy as np

IMG_PX_SIZE = 50
HM_SLICES = 20
n_classes = 2

x = tf.placeholder('float')
y = tf.placeholder('float')

keep_rate = 0.8
keep_prob = tf.placeholder(tf.float32)

def conv3d(x, W):
    return tf.nn.conv3d(x, W, strides=[1,1,1,1,1], padding='SAME')

def maxpool3d(x):
    return tf.nn.max_pool3d(x, ksize=[1,2,2,2,1], strides=[1,2,2,2,1],
                            padding='SAME')

def convolutional_neural_network(x):
    weights = {'W_conv1': tf.Variable(tf.random_normal([3,3,3,1,32])),
               'W_conv2': tf.Variable(tf.random_normal([3,3,3,32,64])),
               'W_fc': tf.Variable(tf.random_normal([62720, 1024])),
               'out': tf.Variable(tf.random_normal([1024, n_classes]))}
    biases = {'b_conv1': tf.Variable(tf.random_normal([32])),
              'b_conv2': tf.Variable(tf.random_normal([64])),
              'b_fc': tf.Variable(tf.random_normal([1024])),
              'out': tf.Variable(tf.random_normal([n_classes]))}
    x = tf.reshape(x, shape=[-1, IMG_PX_SIZE, IMG_PX_SIZE, HM_SLICES, 1])
    conv1 = tf.nn.relu(conv3d(x, weights['W_conv1']) + biases['b_conv1'])
    conv1 = maxpool3d(conv1)
    conv2 = tf.nn.relu(conv3d(conv1, weights['W_conv2']) + biases['b_conv2'])
    conv2 = maxpool3d(conv2)
    fc = tf.reshape(conv2, [-1, 62720])
    fc = tf.nn.relu(tf.matmul(fc, weights['W_fc']) + biases['b_fc'])
    fc = tf.nn.dropout(fc, keep_rate)
    output = tf.matmul(fc, weights['out']) + biases['out']
    return output

def train_neural_network(x):
    much_data = np.load('muchdata_sampled-50-50-20.npy')
    train_data = much_data[:100]
    validation_data = much_data[-100:]
    prediction = convolutional_neural_network(x)
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    hm_epochs = 3
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(hm_epochs):
            epoch_loss = 0
            for data in train_data:
                X = data[0]
                Y = data[1]
                _, c = sess.run([optimizer, cost], feed_dict={x: X, y: Y})
                epoch_loss += c
            print('Epoch', epoch, 'completed out of', hm_epochs, 'loss:', epoch_loss)
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy:', accuracy.eval({x: [i[0] for i in validation_data],
                                          y: [i[1] for i in validation_data]}))

train_neural_network(x)
Please kindly provide some help, as I've been stuck with this for some time now. My only idea is to feed the data in batches instead of the whole thing into the CNN, but I'm not successful with that technique yet. Could someone please point out a way?
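For what it's worth, the "dst tensor is not initialized" error is commonly a symptom of running out of GPU memory, and the accuracy.eval call above feeds all 100 validation samples at once. A minimal sketch of chunked evaluation (batch_size is a hypothetical knob, not from the original code):
# Sketch: evaluate validation accuracy in chunks instead of all at once;
# shrink batch_size until the GPU stops running out of memory.
batch_size = 10
accs = []
for i in range(0, len(validation_data), batch_size):
    chunk = validation_data[i:i + batch_size]
    accs.append(accuracy.eval({x: [d[0] for d in chunk],
                               y: [d[1] for d in chunk]}))
print('Accuracy:', sum(accs) / len(accs))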

OpenCV 3.1 optimization

I'm currently trying to implement an algorithm from a paper with OpenCV 3.1 on Python 2.7, but the process is taking way too long.
The section of my code that's giving me trouble looks something like this:
width, height = mr.shape[:2]
Pm = []
for i in d:
    M = np.float32([[1, 0, -d[i]], [0, 1, 1]])
    mrd = cv2.warpAffine(mr, M, (height, width))
    C = cv2.subtract(ml, mrd)
    C = cv2.pow(C, 2)
    C = np.divide(C, sigma_m)
    C = p0 + (1 - p0)**(-C)
    Pm.append(C)
Where ml, mr and mrd are cv2 objects and d, p0 and sigma_m are integers.
The division and final equation in the last three lines are the real troublemakers here. Every iteration of this cycle is independent, so in theory I could just split the for loop across a few processors, but that seems like a lazy approach where I would just bypass the problem instead of fixing it.
Does anyone know a way to perform those computations faster?
We can leverage the numexpr module to efficiently perform all of those latter arithmetic operations as one evaluate expression.
Thus, these steps:
C = cv2.subtract(ml, mrd)
C = cv2.pow(C,2)
C = np.divide(C, sigma_m)
C = p0 + (1-p0)**(-C)
could be replaced by one expression -
import numexpr as ne
C = ne.evaluate('p0 +(1-p0)**(-((ml-mrd)**2)/sigma_m)')
Let's verify things. The original approach as func -
def original_app(ml, mrd, sigma_m, p0):
    C = cv2.subtract(ml, mrd)
    C = cv2.pow(C, 2)
    C = np.divide(C, sigma_m)
    C = p0 + (1 - p0)**(-C)
    return C
Verification -
In [28]: # Setup inputs
...: S = 1024 # Size parameter
...: ml = np.random.randint(0,255,(S,S))/255.0
...: mrd = np.random.randint(0,255,(S,S))/255.0
...: sigma_m = 0.45
...: p0 = 0.56
...:
In [29]: out1 = original_app(ml, mrd, sigma_m, p0)
In [30]: out2 = ne.evaluate('p0 +(1-p0)**(-((ml-mrd)**2)/sigma_m)')
In [31]: np.allclose(out1, out2)
Out[31]: True
Timings across various sizes of datasets -
In [19]: # Setup inputs
...: S = 1024 # Size parameter
...: ml = np.random.randint(0,255,(S,S))/255.0
...: mrd = np.random.randint(0,255,(S,S))/255.0
...: sigma_m = 0.45
...: p0 = 0.56
...:
In [20]: %timeit original_app(ml, mrd, sigma_m, p0)
10 loops, best of 3: 67.1 ms per loop
In [21]: %timeit ne.evaluate('p0 +(1-p0)**(-((ml-mrd)**2)/sigma_m)')
100 loops, best of 3: 12.9 ms per loop
In [22]: # Setup inputs
...: S = 512 # Size parameter
In [23]: %timeit original_app(ml, mrd, sigma_m, p0)
100 loops, best of 3: 15.3 ms per loop
In [24]: %timeit ne.evaluate('p0 +(1-p0)**(-((ml-mrd)**2)/sigma_m)')
100 loops, best of 3: 3.39 ms per loop
In [25]: # Setup inputs
...: S = 256 # Size parameter
In [26]: %timeit original_app(ml, mrd, sigma_m, p0)
100 loops, best of 3: 3.65 ms per loop
In [27]: %timeit ne.evaluate('p0 +(1-p0)**(-((ml-mrd)**2)/sigma_m)')
1000 loops, best of 3: 878 µs per loop
Around 5x speedup across various sizes with better speedups for larger arrays!
Also, as a side note, I would advise using a pre-allocated output array instead of appending as you are doing at the final step. We could initialize it before going into the loop with something like out = np.zeros((len(d), width, height)) (or np.empty), and at the final step assign into the output array with out[iteration_ID] = C.
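A minimal sketch of both suggestions combined (assuming, as in the question, that d maps loop indices to shifts):
import numexpr as ne
import numpy as np

out = np.empty((len(d), width, height))  # pre-allocated instead of Pm = [] + append
for it, i in enumerate(d):
    M = np.float32([[1, 0, -d[i]], [0, 1, 1]])
    mrd = cv2.warpAffine(mr, M, (height, width))
    # one numexpr expression replaces the subtract/pow/divide/power chain
    out[it] = ne.evaluate('p0 + (1-p0)**(-((ml-mrd)**2)/sigma_m)')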

Fastest way to get a hash from a list?

I have a long list of integers that I want to turn into an MD5 hash. What's the quickest way to do this? I have tried a few options, all similar. Just wondering if I'm missing an obviously quicker method.
import random
import hashlib
import cPickle as pickle

r = [random.randrange(1, 1000) for _ in range(0, 1000000)]

def method1(r):
    p = pickle.dumps(r, -1)
    return hashlib.md5(p).hexdigest()

def method2(r):
    p = str(r)
    return hashlib.md5(p).hexdigest()

def method3(r):
    p = ','.join(map(str, r))
    return hashlib.md5(p).hexdigest()
Then time it in IPython:
timeit method1(r)
timeit method2(r)
timeit method3(r)
Gives me this:
In [8]: timeit method1(r)
10 loops, best of 3: 68.7 ms per loop
In [9]: timeit method2(r)
10 loops, best of 3: 176 ms per loop
In [10]: timeit method3(r)
1 loops, best of 3: 270 ms per loop
So, option 1 is the best I've got. But I have to do it a lot and it's currently the rate determining step in my code.
Any tips or tricks to get a unique hash from a big list quicker than what's here, using Python 2.7?
You may find this useful. It uses my own custom benchmarking framework (based on timeit) to gather and print the results. Since the variations in speed are primarily due to the need to convert the r list into something hashlib.md5() can work with, I've updated the suite of test cases to show how storing the values in an array.array instead, as @DSM suggested in a comment, dramatically speeds things up. Note that since the integers in the list are all relatively small, I've stored them in an array of short (2-byte) values.
from __future__ import print_function
import sys
import timeit

setup = """
import array
import random
import hashlib
import marshal
import cPickle as pickle
import struct

r = [random.randrange(1, 1000) for _ in range(0, 1000000)]
ra = array.array('h', r)  # create an equivalent array of shorts

def method1(r):
    p = pickle.dumps(r, -1)
    return hashlib.md5(p).hexdigest()

def method2(r):
    p = str(r)
    return hashlib.md5(p).hexdigest()

def method3(r):
    p = ','.join(map(str, r))
    return hashlib.md5(p).hexdigest()

def method4(r):
    fmt = '%dh' % len(r)
    buf = struct.pack(fmt, *r)
    return hashlib.md5(buf).hexdigest()

def method5(r):
    a = array.array('h', r)
    return hashlib.md5(a).hexdigest()

def method6(r):
    m = marshal.dumps(r)
    return hashlib.md5(m).hexdigest()

# using pre-built array...

def pb_method1(ra):
    p = pickle.dumps(ra, -1)
    return hashlib.md5(p).hexdigest()

def pb_method2(ra):
    p = str(ra)
    return hashlib.md5(p).hexdigest()

def pb_method3(ra):
    p = ','.join(map(str, ra))
    return hashlib.md5(p).hexdigest()

def pb_method4(ra):
    fmt = '%dh' % len(ra)
    buf = struct.pack(fmt, *ra)
    return hashlib.md5(buf).hexdigest()

def pb_method5(ra):
    return hashlib.md5(ra).hexdigest()

def pb_method6(ra):
    m = marshal.dumps(ra)
    return hashlib.md5(m).hexdigest()
"""

statements = {
    "pickle.dumps(r, -1)": "method1(r)",
    "str(r)": "method2(r)",
    "','.join(map(str, r))": "method3(r)",
    "struct.pack(fmt, *r)": "method4(r)",
    "array.array('h', r)": "method5(r)",
    "marshal.dumps(r)": "method6(r)",
    # versions using pre-built array...
    "pickle.dumps(ra, -1)": "pb_method1(ra)",
    "str(ra)": "pb_method2(ra)",
    "','.join(map(str, ra))": "pb_method3(ra)",
    "struct.pack(fmt, *ra)": "pb_method4(ra)",
    "ra (pre-built)": "pb_method5(ra)",
    "marshal.dumps(ra)": "pb_method6(ra)",
}

N = 10
R = 3
timings = [(
    idea,
    min(timeit.repeat(statements[idea], setup=setup, repeat=R, number=N)),
) for idea in statements]

longest = max(len(t[0]) for t in timings)  # length of longest name
print('fastest to slowest timings (Python {}.{}.{})\n'.format(*sys.version_info[:3]),
      '  ({:,d} calls, best of {:d})\n'.format(N, R))
ranked = sorted(timings, key=lambda t: t[1])  # sort by speed (fastest first)
for timing in ranked:
    print("{:>{width}} : {:.6f} secs, rel speed {rel:>8.6f}x".format(
        timing[0], timing[1], rel=timing[1] / ranked[0][1], width=longest))
Results:
fastest to slowest timings (Python 2.7.6)
  (10 calls, best of 3)

        ra (pre-built) : 0.037906 secs, rel speed   1.000000x
     marshal.dumps(ra) : 0.177953 secs, rel speed   4.694626x
      marshal.dumps(r) : 0.695606 secs, rel speed  18.350932x
   pickle.dumps(r, -1) : 1.266096 secs, rel speed  33.401179x
   array.array('h', r) : 1.287884 secs, rel speed  33.975950x
  pickle.dumps(ra, -1) : 1.955048 secs, rel speed  51.576558x
  struct.pack(fmt, *r) : 2.085602 secs, rel speed  55.020743x
 struct.pack(fmt, *ra) : 2.357887 secs, rel speed  62.203962x
                str(r) : 2.918623 secs, rel speed  76.996860x
               str(ra) : 3.686666 secs, rel speed  97.258777x
 ','.join(map(str, r)) : 4.701531 secs, rel speed 124.032173x
','.join(map(str, ra)) : 4.968734 secs, rel speed 131.081303x
You can improve performance slightly, simplify your code, and remove an import by using Python's built-in hash function instead of md5 from hashlib:
import random
import cPickle as pickle

r = [random.randrange(1, 1000) for _ in range(0, 1000000)]

def method1(r):
    p = pickle.dumps(r, -1)
    return hash(p)

def method2(r):
    p = str(r)
    return hash(p)

def method3(r):
    p = ','.join(map(str, r))
    return hash(p)
