I am trying to create an affine term structure model derived from statsmodels.tsa.statespace.MLEModel (code below) which is initialized using least squares.
class affine_term_structure(sm.tsa.statespace.MLEModel):
    """Three-factor affine term structure model in state-space form.

    Parameter vector layout consumed by ``update``:

    * ``params[0:9]``   -- transition matrix ``Phi`` (3x3, row-major)
    * ``params[9:12]``  -- state intercept ``k``
    * ``params[12:18]`` -- lower-triangular factor ``Sx`` of the state
      covariance (the covariance itself is ``Sx @ Sx.T``)
    * ``params[18]``    -- decay parameter ``lmbd`` passed to ``self.nss``
    * ``params[-1]``    -- observation noise standard deviation ``sy``

    NOTE(review): ``self.nss`` is not defined in the visible source; it is
    presumably the loading function that maps maturities and ``lmbd`` to the
    design matrix -- confirm.
    """

    def __init__(self, yields, tau, k_states=3, **kwargs):
        """Set up the state space for ``yields`` observed at maturities ``tau``."""
        # Initialize the statespace
        super(affine_term_structure, self).__init__(
            yields, k_states=k_states, **kwargs)
        # update() reads self.tau to build the design matrix, so keep it.
        self.tau = tau
        # The state equation is x(t+1) = T x(t) + R eta(t+1), eta ~ N(0, Q).
        # R (selection) is not initialised to the identity automatically, so
        # without this line state_cov (Q) never enters the prediction step.
        self['selection'] = np.eye(self.k_states)
        self.initialize_known(
            np.zeros(self.k_states), np.eye(self.k_states) * 10000)

    def update(self, params, **kwargs):
        """Write the system matrices implied by ``params`` into the model."""
        params = super(affine_term_structure, self).update(params, **kwargs)

        # Extract the parameters
        Phi = np.reshape(params[:9], (3, 3))
        k = np.array(params[9:12])
        Sx = np.zeros((3, 3))
        Sx[np.tril_indices(3)] = params[12:18]
        lmbd = params[18]
        sy = params[-1]
        b = self.nss(self.tau, lmbd)

        self['transition'] = Phi  # transition matrix
        self['state_intercept'] = k  # transition offset
        # Sx is only a triangular factor; the product is the 3x3 SPD matrix.
        self['state_cov'] = Sx @ Sx.T  # transition covariance
        self['design'] = b  # design matrix
        # self['obs_intercept'] = 0  # observation intercept
        self['obs_cov'] = sy * sy * np.eye(self.k_endog)  # observation covariance
However, I noticed that on running the filter/smoother the states were being excessively smoothed. Digging through the filtering results it seems like the state_cov is not being used in the prediction step.
For example, the predicted state covariance equals
self.transition[:,:,0] @ self.filtered_state_cov[:,:,0] @ self.transition[:,:,0].T
Though I would have expected it to be equal to
self.transition[:,:,0] @ self.filtered_state_cov[:,:,0] @ self.transition[:,:,0].T + self.state_cov[:,:,0]
For good order, please note that all parameter matrices are time invariant.
I'm not sure what I'm missing here, and any help would be much appreciated.
In Statsmodels, the state equation is:
x(t+1) = T x(t) + R eta(t+1)
where eta(t+1) ~ N(0, Q)
When you set state_cov, you're setting Q, but you also need to set R, which is selection.
For example, if you want your state equation to be:
x(t+1) = T x(t) + eta(t+1)
Then you would do something like:
self['selection'] = np.eye(3)
It is not the case that R is the identity in every state space model, and it can't even always be initialized to the identity matrix, since the dimension of x(t) and the dimension of eta(t) can be different. That's why R is not automatically initialized to the identity matrix.
I'm really new to GPU coding. I found this K-means CuPy code; my purpose is to work with a large dataset of shape (n, 3), for example, to see the timing difference between GPU and CPU. I want to have a huge number of clusters, but I am getting a memory management error. Can someone point me to what I should research in order to fix it? I have already done some research, but I do not have a clear starting point yet.
import contextlib
import time
import cupy
import matplotlib.pyplot as plt
import numpy
@contextlib.contextmanager
def timer(message):
    """Context manager that prints the wall-clock time spent in its body.

    Usage: ``with timer(' CPU '): ...``. The elapsed time is printed as
    ``"<message>: <seconds> sec"`` when the body finishes normally.

    NOTE(review): GPU kernels are launched asynchronously; presumably the
    device should be synchronised before the clock is read when timing
    CuPy code -- confirm.
    """
    start = time.perf_counter()
    # Hand control to the body of the ``with`` statement; without this yield
    # the function is not a context manager and measures nothing.
    yield
    end = time.perf_counter()
    print('%s: %f sec' % (message, end - start))
# Squared Euclidean distance between a 2-D point (x0, x1) and a 2-D
# centroid (c0, c1), evaluated element-wise over broadcast inputs.
var_kernel = cupy.ElementwiseKernel(
    'T x0, T x1, T c0, T c1', 'T out',
    'out = (x0 - c0) * (x0 - c0) + (x1 - c1) * (x1 - c1)',
    'var_kernel',
)

# Sum of x over the reduced axis, counting only entries where mask is true.
sum_kernel = cupy.ReductionKernel(
    'T x, S mask', 'T out',
    'mask ? x : 0',
    'a + b', 'out = a', '0',
    'sum_kernel',
)

# Number of true entries of mask over the reduced axis, as float32.
count_kernel = cupy.ReductionKernel(
    'T mask', 'float32 out',
    'mask ? 1.0 : 0.0',
    'a + b', 'out = a', '0.0',
    'count_kernel',
)
def fit_xp(X, n_clusters, max_iter):
    """Run k-means on ``X`` with whichever array module owns ``X``.

    Args:
        X: 2-D array of samples (NumPy or CuPy), one sample per row.
        n_clusters: number of centroids to fit.
        max_iter: maximum number of assignment/update iterations.

    Returns:
        ``(centers, pred)``: the centroid coordinates and, for every sample,
        the integer index of the centroid it is assigned to.
    """
    assert X.ndim == 2

    # Get NumPy or CuPy module from the supplied array.
    xp = cupy.get_array_module(X)

    n_samples = len(X)

    # Labels indicating which cluster each sample belongs to. Integer dtype
    # so the result has the same type whether or not the loop updates it.
    pred = xp.zeros(n_samples, dtype=int)

    # Choose the initial centroid for each cluster.
    initial_indexes = xp.random.choice(n_samples, n_clusters, replace=False)
    centers = X[initial_indexes]

    # Loop-invariant column of cluster ids used to build the membership mask.
    cluster_ids = xp.arange(n_clusters)[:, None]

    for _ in range(max_iter):
        # Compute the new label for each sample.
        distances = xp.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        new_pred = xp.argmin(distances, axis=1)

        # If no label changed, the algorithm has converged: stop iterating.
        if xp.all(new_pred == pred):
            break
        pred = new_pred

        # Compute the new centroid for each cluster.
        mask = pred == cluster_ids
        sums = xp.where(mask[:, :, None], X, 0).sum(axis=1)
        counts = xp.count_nonzero(mask, axis=1).reshape((n_clusters, 1))
        # A cluster that lost all its samples keeps its previous centroid
        # instead of becoming NaN through a division by zero.
        centers = xp.where(counts > 0, sums / xp.maximum(counts, 1), centers)

    return centers, pred
def fit_custom(X, n_clusters, max_iter):
    """Run k-means on a CuPy array using the custom kernels.

    Only columns 0 and 1 of ``X`` enter the distance computation, because
    ``var_kernel`` takes two coordinates; the centroid update sums every
    column of ``X``.

    Args:
        X: 2-D CuPy array of samples, one sample per row.
        n_clusters: number of centroids to fit.
        max_iter: maximum number of assignment/update iterations.

    Returns:
        ``(centers, pred)``: the centroid coordinates and, for every sample,
        the integer index of the centroid it is assigned to.
    """
    assert X.ndim == 2

    n_samples = len(X)

    # argmin yields integer labels, so start from an integer array as well.
    pred = cupy.zeros(n_samples, dtype=int)

    initial_indexes = cupy.random.choice(n_samples, n_clusters, replace=False)
    centers = X[initial_indexes]

    # Loop-invariant column of cluster ids used to build the membership mask.
    cluster_ids = cupy.arange(n_clusters)[:, None]

    for _ in range(max_iter):
        # Pair coordinate 0 with coordinate 0 and 1 with 1; the kernel
        # signature is (x0, x1, c0, c1).
        distances = var_kernel(X[:, None, 0], X[:, None, 1],
                               centers[None, :, 0], centers[None, :, 1])
        new_pred = cupy.argmin(distances, axis=1)

        # If no label changed, the algorithm has converged: stop iterating.
        if cupy.all(new_pred == pred):
            break
        pred = new_pred

        mask = pred == cluster_ids
        sums = sum_kernel(X, mask[:, :, None], axis=1)
        counts = count_kernel(mask, axis=1).reshape((n_clusters, 1))
        # A cluster that lost all its samples keeps its previous centroid
        # instead of becoming NaN through a division by zero.
        centers = cupy.where(
            counts > 0, sums / cupy.maximum(counts, 1), centers)

    return centers, pred
def draw(X, n_clusters, centers, pred, output):
    """Plot the samples and centroids of the fitted clusters into an image file.

    Args:
        X: 2-D array of samples; columns 0 and 1 are plotted.
        n_clusters: number of clusters to draw.
        centers: centroid coordinates; columns 0 and 1 are plotted.
        pred: cluster index of every sample in ``X``.
        output: path of the image file to write.

    NOTE(review): assumes host (NumPy) arrays; CuPy arrays presumably need
    ``cupy.asnumpy`` in the caller first -- confirm.
    """
    for i in range(n_clusters):
        members = X[pred == i]
        # One random RGB colour per cluster; ``color=`` takes a single colour
        # unambiguously even when a cluster happens to hold three points.
        plt.scatter(members[:, 0], members[:, 1], color=numpy.random.rand(3))
    plt.scatter(
        centers[:, 0], centers[:, 1], s=120, marker='s', facecolors='y',
        edgecolors='k')
    plt.savefig(output)
def run_cpu(gpuid, n_clusters, num, max_iter, use_custom_kernel):
    """Time ``fit_xp`` on NumPy data.

    The data set stacks ``num`` standard-normal 3-D samples shifted by +1 on
    top of the same samples shifted by -1. ``gpuid`` and
    ``use_custom_kernel`` are accepted but not used here.
    """
    base = numpy.random.randn(num, 3)
    shifted_up = base + 1
    shifted_down = base - 1
    X_train = numpy.r_[shifted_up, shifted_down]
    with timer(' CPU '):
        fitted = fit_xp(X_train, n_clusters, max_iter)
        centers, pred = fitted
def run_gpu(gpuid, n_clusters, num, max_iter, use_custom_kernel):
    """Time k-means on the GPU selected by ``gpuid``.

    The data set stacks ``num`` standard-normal 3-D samples shifted by +1 on
    top of the same samples shifted by -1 and is copied to the device before
    the timer starts.

    Args:
        gpuid: index of the CUDA device to run on.
        n_clusters: number of centroids to fit.
        num: number of base samples (the data set has ``2 * num`` rows).
        max_iter: maximum number of k-means iterations.
        use_custom_kernel: if true run ``fit_custom``, otherwise ``fit_xp``.
    """
    samples = numpy.random.randn(num, 3)
    X_train = numpy.r_[samples + 1, samples - 1]

    with cupy.cuda.Device(gpuid):
        X_train = cupy.asarray(X_train)
        with timer(' GPU '):
            # Run exactly one implementation; without the else branch the
            # generic version ran every time and overwrote the result.
            if use_custom_kernel:
                centers, pred = fit_custom(X_train, n_clusters, max_iter)
            else:
                centers, pred = fit_xp(X_train, n_clusters, max_iter)
By the way, I am working in Colab Pro with 25 GB of RAM. The code works with n_clusters=200 and num=1000000, but if I use bigger numbers the error appears. I am running the code like this:
This is the error that i have
Any suggestion will be welcome, thanks for your time.
Assuming that CuPy is clever enough not to create explicit copies of the broadcasted input of var_kernel, the output distances has to have a size of 2 * num * num_clusters which are exactly the 6,400,000,000 Bytes it is trying to allocate. You could have a way smaller memory footprint by never actually writing the distances to memory which means fusing the var_kernel with argmin. See this part of the docs.
If I understand the example there correctly, this should work:
def argmin_distance(x1, y1, x2, y2):
    """Return, along axis 1, the index of the smallest squared distance.

    The squared distance is ``(x1 - x2)**2 + (y1 - y2)**2`` computed on the
    broadcast inputs.

    NOTE(review): the surrounding discussion is about kernel fusion, so this
    was presumably meant to carry a ``@cupy.fuse()`` decorator -- confirm
    that fusion supports an ``argmin`` reduction before adding it.
    """
    dx = x1 - x2
    dy = y1 - y2
    squared = dx * dx + dy * dy
    return cupy.argmin(squared, axis=1)
The next question would be where the other 13.7GB come from. A big part of them might just be the instances of distances from earlier iterations. I'm not a CuPy expert, but at least in Python/Numpy your use of distances inside the loop would not reuse the same memory, but allocate more memory each time you call the var_kernel. The same problem is visible with pred which is allocated before the loop. If CuPy does things the Numpy way, the solution would be to just put [:] in there like
pred[:] = new_pred
# distances is 2-D (n_samples, n_clusters), so it takes two slices, not three.
distances[:, :] = var_kernel(X[:, None, 0], X[:, None, 1],
                             centers[None, :, 1], centers[None, :, 0])
For this to work, you need to allocate distances before the loop as well. Also this isn't needed anymore when using kernel fusion, so just take it as an example. It may be best to allocate everything beforehand and then use this syntax everywhere in the loop.
I don't know enough about CuPy to answer why fit_xp doesn't have the same problem (or does it?). But my guess would be that garbage collection with CuPy objects works differently there. If garbage collection were "quick enough" in fit_custom it should work even without kernel fusion or reusing already allocated arrays.
Other problems or at least oddities with your code:
Why are you comparing the zeroth coordinate of centers with the first coordinate of X? Wouldn't it make more sense to call
distances = var_kernel(X[:, None, 0], X[:, None, 1],
centers[None, :, 0], centers[None, :, 1])
Why are you creating 3D data when only using the projection on the 2D plane? So why not
samples = numpy.random.randn(num, 2)
Why are you using floats for (the initial version of) pred? The argmin should give an integer type result.
I would like to constrain the variable value u < 1 in my model. I added ub=1 to the variable definition, u = m.Var(name='u', value=0, lb=-2, ub=1), but it resulted in "No solution found" (EXIT: Converged to a point of local infeasibility. Problem may be infeasible.). I guess I have to reformulate the problem to avoid this, but I have not been able to find examples of how this should be done. How do I write a proper model to avoid infeasible solutions when constraining variable values?
I have tried to reformulate the problem by adding an equation like m.Equation(u < 1), with no success.
# Reproduction script: dynamic optimization of the time constant Tc for a
# first-order model with hard bounds on y and u.
import numpy as np
from gekko import GEKKO
import matplotlib.pyplot as pyplt
m = GEKKO(remote=False)
t = np.linspace(0, 1000, 101) # time
# Disturbance profile: 0 for the first 10 samples, 1 afterwards.
d = np.ones(t.shape)
d[0:10] = 0
# Add data to model
m.time = t
K = m.Const(0.01, name='K')
r = m.Const(name='r', value=0) # Reference
# NOTE: d is rebound here from the NumPy profile to the GEKKO parameter.
d = m.Param(name='d', value=d) # Disturbance
y = m.Var(name='y', value=0, lb=-2, ub=2) # State variable
# ub=1 is the hard bound reported to make the problem infeasible; with ub=2
# the accompanying report says the problem solves.
u = m.Var(name='u', value=0, lb=-2, ub=1) # Output
e = m.Var(name='e', value=0)
Tc = m.FV(name='Tc', value=1200, lb=60, ub=1200) # time constant
# Update variable status
Tc.STATUS = 1 # Optimizer can adjust value
# Tuning values derived from Tc: Kp = 1 / (K * Tc), Ti = 4 * Tc.
Kp = m.Intermediate(1 / K * 1 / Tc, name='Kp')
Ti = m.Intermediate(4 * Tc, name='Ti')
# Model equations
# Process dynamics, error definition and the control law in rate form.
m.Equations([y.dt() == K * (u-d),
e == r-y,
u.dt() == Kp*e.dt()+Kp/Ti*e])
# Model constraints
# Hard inequality constraints keeping y inside (-0.5, 0.5).
m.Equation(y < 0.5)
m.Equation(y > -0.5)
# Model objective
# NOTE(review): no objective statement is visible here although the
# accompanying discussion refers to m.Obj(-Tc) -- confirm whether a line
# was lost.
# options
m.options.IMODE = 6 # Problem type: 6 = Dynamic optimization
# solve
m.solve(disp=True, debug=True)
print('Tc: %6.2f [s]' % (Tc.value[-1], ))
# Three stacked axes sharing the time axis: y, e, and u together with d.
fig1, (ax1, ax2, ax3) = pyplt.subplots(3, sharex='all')
ax1.plot(t, y.value)
ax1.set_ylabel("y", fontsize=8), ax1.grid(True, which='both')
ax2.plot(t, e.value)
ax2.set_ylabel("e", fontsize=8), ax2.grid(True, which='both')
ax3.plot(t, u.value)
ax3.plot(t, d.value)
ax3.set_ylabel("u and d", fontsize=8), ax3.grid(True, which='both')
EXIT: Converged to a point of local infeasibility. Problem may be infeasible.
An error occured.
The error code is 2
If I change the upper bound of u to 2, the optimization problem is solved as expected.
Hard constraints on variables can lead to an infeasible solution, as you observed. I recommend that you use soft constraints by specifying the variable y as a Controlled Variable and set an upper and lower set point range with SPHI and SPLO.
y = m.CV(name='y', value=0) # Controlled variable
y.STATUS = 1
y.TR_INIT = 0
y.SPHI = 0.5
y.SPLO = -0.5
I also removed the lb and ub from y and u to not give them hard bounds that can lead to the infeasibility. You also have an objective to maximize the value of Tc with m.Obj(-Tc). It goes to the maximum limit: 1200 when the solver is able to adjust the value. As you can see from the plot, the value of y exceeds the setpoint range. It may not be possible for the controller to keep it within that range. A soft constraint (objective based) approach to constraints penalizes deviations but does not lead to an infeasible solution. If you need to increase the penalty on violations of the SPHI or SPLO, the parameters WSPHI and WSPLO can be adjusted.
It appears that you have a first order dynamic model and you are trying to optimize PID parameters. If you need to model saturation of the controller output (actuator) then the if3, max3, min3 or corresponding if2, max2, min2 functions may be useful. There is more information on CV objectives and tuning in the Dynamic Optimization course.
Here is a feasible solution to your problem:
# Feasible variant: the hard bounds on y and u are replaced by a soft
# set-point range on y (a controlled variable with SPHI / SPLO).
import numpy as np
from gekko import GEKKO
import matplotlib.pyplot as pyplt

m = GEKKO()  # remote=False
t = np.linspace(0, 1000, 101)  # time

# Disturbance profile: 0 for the first 10 samples, 1 afterwards. Kept under
# its own name so that `d` always refers to the GEKKO parameter.
d_profile = np.ones(t.shape)
d_profile[0:10] = 0

# Add data to model
m.time = t
K = m.Const(0.01, name='K')
r = m.Const(name='r', value=0)  # Reference
d = m.Param(name='d', value=d_profile)  # Disturbance
e = m.Var(name='e', value=0)
u = m.Var(name='u', value=0)  # Output (no hard bounds)
Tc = m.FV(name='Tc', value=1200, lb=60, ub=1200)  # time constant

y = m.CV(name='y', value=0)  # Controlled variable
y.STATUS = 1
y.TR_INIT = 0
y.SPHI = 0.5   # upper edge of the soft set-point range
y.SPLO = -0.5  # lower edge of the soft set-point range

# Update variable status
Tc.STATUS = 1  # Optimizer can adjust value

# Tuning values derived from Tc: Kp = 1 / (K * Tc), Ti = 4 * Tc.
Kp = m.Intermediate((1 / K) * (1 / Tc), name='Kp')
Ti = m.Intermediate(4 * Tc, name='Ti')

# Model equations: process dynamics, error definition, control law.
m.Equations([y.dt() == K * (u - d),
             e == r - y,
             u.dt() == Kp * e.dt() + (Kp / Ti) * e])

# Model constraints
# The hard constraints y < 0.5 and y > -0.5 are intentionally omitted; the
# SPHI / SPLO range above penalises violations instead.

# Model objective: maximise Tc (the objective the discussion refers to).
m.Obj(-Tc)

# options
m.options.IMODE = 6  # Problem type: 6 = Dynamic optimization
m.options.SOLVER = 3
m.options.MAX_ITER = 1000

# solve
m.solve(disp=True, debug=True)
print('Tc: %6.2f [s]' % (Tc.value[-1], ))

# Three stacked axes sharing the time axis: y, e, and u together with d.
fig1, (ax1, ax2, ax3) = pyplt.subplots(3, sharex='all')
ax1.plot(t, y.value)
ax1.set_ylabel("y", fontsize=8)
ax1.grid(True, which='both')
ax2.plot(t, e.value)
ax2.set_ylabel("e", fontsize=8)
ax2.grid(True, which='both')
ax3.plot(t, u.value)
ax3.plot(t, d.value)
ax3.set_ylabel("u and d", fontsize=8)
ax3.grid(True, which='both')
# Display the figure; nothing is rendered otherwise when run as a script.
pyplt.show()
Thanks for an extensive and useful answer to my question. I really appreciate it.
As you correctly observed I am trying to optimize tuning parameters for my simple control problem. I have executed your code with soft constraints, and it sure solves the feasibility issue. I also added the WSPHI/LO parameters and set their values high to have a solution within the constraints. Still, I like to have a model where the control output (“u”) is bounded [0,1]. Based on your answer I probably must add “if” or “max/min” statements in the model to avoid having a non-feasible set of equations when “u” hits the bound. Something like “if u<0, u.dt() = 0 else u.dt() = Kp*e ….”. Could it alternatively be possible to add a variable (a type slack variable) to ensure feasibility of the equation set? I will also investigate the material in the dynamic optimization course links to get a better understanding of dynamic modelling. Thanks again for guiding me in the right direction in this issue.
I have a fairly simple test data set I am trying to fit with pymc3.
The result generated by traceplot looks something like this.
Essentially the trace of all parameter look like there is a standard 'caterpillar' for 100 iterations, followed by a flat line for 750 iterations, followed by the caterpillar again.
The initial 100 iterations happen after 25,000 ADVI iterations, and 10,000 tune iterations. If I change these amounts, I randomly will/won't have these periods of unwanted stability.
I'm wondering if anyone has any advice about how I can stop this from happening - and what is causing it?
The full code is below. In brief, I am generating a set of 'phases' (-pi -> pi) with a corresponding set of values y = a(j)*sin(phase) + b(j)*cos(phase). a and b are drawn for each subject j at random, but are related to each other.
I then essentially try to fit this same model.
Edit: Here is a similar example, running for 25,000 iterations. Something goes wrong around iteration 20,000.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
%matplotlib inline
# Sampler settings
n_draw = 2000
n_tune = 10000
n_init = 25000
init_string = 'advi'
target_accept = 0.95

# Generate some test data
# Just generates:
# x a vector of phases
# y a vector corresponding to some sinusoidal function of x
# subject_idx a vector corresponding to which subject x is

# 9 subjects
N_j = 9
# Each with 276 measurements
N_i = 276
sigma_y = 1.0
mean = [0.1, 0.1]
cov = [[0.01, 0], [0, 0.01]]  # diagonal covariance

x_sub = np.zeros((N_j, N_i))
y_sub = np.zeros((N_j, N_i))
y_true_sub = np.zeros((N_j, N_i))
ab_sub = np.zeros((N_j, 2))
tuning_sub = np.zeros((N_j, 1))
sub_ix_sub = np.zeros((N_j, N_i))

for j in range(N_j):
    # Per-subject sine and cosine coefficients.
    aj, bj = np.random.multivariate_normal(mean, cov)
    # Phases in [-pi, pi], sorted for convenience. Sorting the 1-D vector
    # actually orders the phases; sorting an (N_i, 1) column along its last
    # axis, as before, left them unsorted.
    xj = np.sort(np.random.uniform(-1, 1, size=N_i) * np.pi)
    yj_true = aj * np.sin(xj) + bj * np.cos(xj)
    yj = yj_true + np.random.normal(scale=sigma_y, size=N_i)

    x_sub[j, :] = xj
    y_sub[j, :] = yj
    y_true_sub[j, :] = yj_true
    ab_sub[j, :] = [aj, bj]
    tuning_sub[j, :] = np.hypot(aj, bj)  # amplitude sqrt(aj**2 + bj**2)
    sub_ix_sub[j, :] = j

# Flatten to one long vector per quantity, subject by subject.
x = np.ravel(x_sub)
y = np.ravel(y_sub)
subject_idx = np.ravel(sub_ix_sub)
subject_idx = np.asarray(subject_idx, dtype=int)
# Fit model: hierarchical sine/cosine regression with one coefficient pair
# per subject, followed by sampling in the same model context.
with pm.Model() as hb1_model:
    # Hyperpriors on the population mean and spread of each coefficient
    hb1_mu_a = pm.Normal('hb1_mu_a', mu=0., sd=100)
    hb1_mu_b = pm.Normal('hb1_mu_b', mu=0., sd=100)
    hb1_sigma_a = pm.HalfCauchy('hb1_sigma_a', 4)
    hb1_sigma_b = pm.HalfCauchy('hb1_sigma_b', 4)

    # Subject-level sine (a) and cosine (b) coefficients, one per subject
    hb1_aj = pm.Normal('hb1_aj', mu=hb1_mu_a, sd=hb1_sigma_a, shape=N_j)
    hb1_bj = pm.Normal('hb1_bj', mu=hb1_mu_b, sd=hb1_sigma_b, shape=N_j)

    # Model error
    hb1_eps = pm.HalfCauchy('hb1_eps', 5)

    # Expected value: each observation uses its own subject's coefficients
    sine_term = hb1_aj[subject_idx] * pm.math.sin(x)
    cosine_term = hb1_bj[subject_idx] * pm.math.cos(x)
    hb1_linear = sine_term + cosine_term
    hb1_linear_like = pm.Normal('y', mu=hb1_linear, sd=hb1_eps, observed=y)

    hb1_trace = pm.sample(
        draws=n_draw,
        tune=n_tune,
        init=init_string,
        n_init=n_init,
        target_accept=target_accept,
    )
To partially answer my own question: After playing with this for a while, it looks like the problem might be due to the hyperprior standard deviation going to 0. I am not sure why the algorithm should get stuck there though (testing a small standard deviation can't be that uncommon...).
In any case, two solutions that seem to alleviate the problem (although they don't remove it entirely) are:
1) Add an offset to the definitions of the standard deviation. e.g.:
offset = 1e-2
hb1_sigma_a = offset + pm.HalfCauchy('hb1_sigma_a', 4)
2) Instead of using a HalfCauchy or HalfNormal for the SD prior, use a logNormal distribution set so that 0 is unlikely.
I'd look at the divergences, as explained in notes and literature on Hamiltonian Monte Carlo, see, e.g., here and here.
with model:
np.savetxt('diverging.csv', hb1_trace['diverging'])
As a dirty solution, you can try to increase target_accept, perhaps.
Good luck!
I was recently working on a deep learning model in Keras and it gave me very perplexing results. The model is capable of mastering the training data over time, but it consistently gets worse results on the validation data.
I know that if the validation accuracy goes up for a while and then starts to decrease that you are over-fitting to the training data, but in this case, the validation accuracy only ever decreases. I am really confused why this happens. Does anyone have any intuition as to what could cause this to happen? Or any suggestions on things to test to potentially fix it?
Edit to add more info and code
Ok. So I am making a model that is trying to do some basic stock predictions. By looking at the open, high, low, close, and volume of the last 40 days, the model tries to predict whether or not the price will go up two average true ranges without going down one average true range. As input, I took CSVs from Yahoo Finance that include this information for the last 30 years for all of the stocks in the Dow Jones Industrial Average. The model trains on 70% of the stocks and validates on the other 20%. This leads to about 150,000 training samples. I am currently using a 1d Convolutional Neural Network, but I have also tried other smaller models (logistic regression and small Feed Forward NN) and I always get the same either diverging train and validation loss or nothing learned at all because the model is too simple.
Here is the code:
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import auc, roc_curve, roc_auc_score
from keras.layers import Input, Dense, Flatten, Conv1D, Activation, MaxPooling1D, Dropout, Concatenate
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import backend as K
import matplotlib.pyplot as plt
from random import seed, shuffle
from os import listdir
class roc_auc(Callback):
    """Keras callback that records the validation ROC AUC after every epoch.

    The scores are collected in ``self.aucs``, one entry per epoch. Hooks
    that did nothing (train end, epoch begin, batch begin/end) are left to
    the no-op implementations inherited from ``Callback``.
    """

    def on_train_begin(self, logs=None):
        """Start with an empty AUC history."""
        self.aucs = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation data and print the AUC for this epoch."""
        # validation_data[0] holds the inputs, validation_data[1] the labels.
        y_pred = self.model.predict(self.validation_data[0])
        self.aucs.append(roc_auc_score(self.validation_data[1], y_pred))
        if max(self.aucs) == self.aucs[-1]:
            # NOTE(review): the statement guarded by this best-so-far check is
            # missing from the visible source (presumably a model save) --
            # restore it here.
            pass
        print(" - auc: %0.4f" % self.aucs[-1])
rrr = 2
epochs = 200
batch_size = 64
days_input = 40
X_train = []
X_test = []
y_train = []
y_test = []
files = listdir("Stocks")
total_stocks = len(files)
for x, file in enumerate(files):
test = False
if (x+1.0)/total_stocks > 0.7:
test = True
if test:
print("Test -> Stocks/%s" % file)
print("Train -> Stocks/%s" % file)
stock = np.loadtxt(open("Stocks/"+file, "r"), delimiter=",", skiprows=1, usecols = (1,2,3,5,6))
atr = []
last = None
for day in stock:
if last is None:
tr = abs(day[1] - day[2])
tr = max(day[1] - day[2], abs(last[3] - day[1]), abs(last[3] - day[2]))
last = day.copy()
stock = np.insert(stock, 5, atr, axis=1)
for i in range(days_input,stock.shape[0]-1):
input = stock[i-days_input:i, 0:5].copy()
for j, day in enumerate(input):
input[j][1] = (day[1]-day[0])/day[0]
input[j][2] = (day[2]-day[0])/day[0]
input[j][3] = (day[3]-day[0])/day[0]
input[:,0] = input[:,0] / np.linalg.norm(input[:,0])
input[:,1] = input[:,1] / np.linalg.norm(input[:,1])
input[:,2] = input[:,2] / np.linalg.norm(input[:,2])
input[:,3] = input[:,3] / np.linalg.norm(input[:,3])
input[:,4] = input[:,4] / np.linalg.norm(input[:,4])
preprocessing.scale(input, copy=False)
output = -1
buy = stock[i][1]
stoploss = buy - stock[i][5]
target = buy + rrr*stock[i][5]
for j in range(i+1, stock.shape[0]):
if stock[j][0] < stoploss or stock[j][2] < stoploss:
output = 0
elif stock[j][1] > target:
output = 1
if output != -1:
if test:
shape = list(X_train[0].shape)
shape[:0] = [len(X_train)]
X_train = np.concatenate(X_train).reshape(shape)
y_train = np.array(y_train)
shape = list(X_test[0].shape)
shape[:0] = [len(X_test)]
X_test = np.concatenate(X_test).reshape(shape)
y_test = np.array(y_test)
print("Train class split is %0.2f" % (100*np.average(y_train)))
print("Test class split is %0.2f" % (100*np.average(y_test)))
# 1-D CNN over the (days_input, 5) feature window: three conv/relu/pool
# stages with 32, 64 and 128 filters, then two dense layers and a sigmoid.
inputs = Input(shape=(days_input, 5))
x = inputs
for n_filters in (32, 64, 128):
    x = Conv1D(n_filters, 5, padding='same')(x)
    x = Activation('relu')(x)
    x = MaxPooling1D()(x)
x = Flatten()(x)
x = Dense(128, activation="relu")(x)
x = Dense(64, activation="relu")(x)
output = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inputs, outputs=output)

# NOTE(review): no compile call is visible in the source. Binary
# cross-entropy matches the sigmoid output and the accuracy metric is what
# the 'val_acc' monitor below needs; the optimizer is a guess -- confirm.
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# NOTE(review): `filepath` is not defined in the visible source; this name
# is a placeholder -- confirm the intended checkpoint path.
filepath = "weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0,
                             save_best_only=True, mode='max')
auc_hist = roc_auc()
callbacks_list = [checkpoint, auc_hist]

# Keras expects class_weight as a {class index: weight} dict, so compute the
# "balanced" weights n_samples / (n_classes * class_count) explicitly.
class_counts = np.bincount(y_train.astype(int), minlength=2)
class_weight = {
    label: len(y_train) / (len(class_counts) * float(count))
    for label, count in enumerate(class_counts)
    if count
}

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    callbacks=callbacks_list,
    batch_size=batch_size,
    class_weight=class_weight,
).history
model_json = model.to_json()
with open("model.json", "w") as json_file:
plt.title('model accuracy')
plt.legend(['train', 'test'], loc='upper left')
plt.title('model loss')
plt.legend(['train', 'test'], loc='upper left')
plt.title('model ROC AUC')
def _plot_roc(position, y_true, y_score, title):
    """Draw one ROC curve in subplot ``position`` of a 1x2 grid.

    Returns the ``(fpr, tpr, thresholds)`` arrays from ``roc_curve``.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    area = auc(fpr, tpr)
    plt.subplot(1, 2, position)
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    return fpr, tpr, thresholds


# Train curve on the left, test curve on the right. The area is kept local
# to the helper so the `roc_auc` callback class is no longer overwritten.
_plot_roc(1, y_train, model.predict(X_train), 'Train ROC')
y_pred = model.predict(X_test)
fpr, tpr, thresholds = _plot_roc(2, y_test, y_pred, 'Test ROC')

# Dump the test-set ROC points, one "fpr,tpr,threshold" row per threshold.
with open('roc.csv', 'w') as roc_file:
    roc_file.writelines(
        "%f,%f,%f\n" % row for row in zip(fpr, tpr, thresholds))
Results by 100 batches instead of by epoch
I listened to suggestions and made a few updates. The classes are now balanced 50% to 50% instead of 25% to 75%. Also, the validation data is randomly selected now instead of being a specific set of stocks. By graphing the loss and accuracy at a finer resolution(100 batches vs 1 epoch), the over-fitting can clearly be seen. The model does actually start to learn at the very beginning before it starts to diverge. I am surprised at how fast it starts to over-fit, but now that I can see the issue hopefully I can debug it.
Possible explanations
Coding error
Overfitting due to differences in the training / validation data
Skewed classes (and differences in the training / validation data)
Things I would try
Swapping the training and the validation set. Does the problem still occur?
Plot the curves in more detail for the first ~10 epochs (e.g. directly after initialization; each few training iterations, not only per epoch). Do you still start at > 75%? Then your classes might be skewed and you might also want to check if your training-validation split is stratified.
This is useless: np.concatenate(X_train)
Make your code as readable as possible when you post it here. This includes removing lines which are commented out.
This looks suspicious for a coding error to me:
if test:
#if((output == 0 and np.average(y_train) > 0.5) or output == 1):
use sklearn.model_selection.train_test_split instead. Do all transformations to the data before, then make the split with this method.
Looks like the batch size is much too small for the number of training samples you have. Try batching 20% and see if that makes a difference.