I am opening this new thread because I am looking for some to use Dymos, in order to simulate a dynamic system.
Indeed, I am trying to simulate a system which is composing of a pressurized bottle and a fluid inside. When t=0, the pressure is pushing the fluid through the bottle output, and as a result the pressure inside the bottle is decreasing. My aim is to simulate the behaviour of the pressure inside the bottle and the fluid volumic flow which is escaping from the bottle. If found an Dymos example whicih is very similar to what I am trying to do, but more simpler. https://openmdao.github.io/dymos/examples/water_rocket/water_rocket.html
To model my system, I am using two explicit components: the PressureRate, the VolumeFLowRate. Then I am defining the group component PressureModelODE to connect these two last components and their variables.
Here are these components:
class PressureRate(om.ExplicitComponent):
def initialize(self):
self.options.declare('num_nodes', types=int)
def setup(self):
nn = self.options['num_nodes']
# Inputs
self.add_input('p', shape=(nn,), desc='Pressure inside the nox bottle', units='Pa')
self.add_input('Vb', shape=(nn,), desc='Bottle volume', units='m**3')
self.add_input('Vl', shape=(nn,), desc='Liquid volume', units='m**3')
self.add_input('Vl_dot', shape=(nn,), desc='Liquid volume flow rate', units='m**3/s')
self.add_input('gamma', shape=(nn,), desc='Heat capacity ratio')
# Outputs
self.add_output('p_dot', val=np.ones(nn), desc='Pressure change rate', units='Pa/s')
self.declare_partials(of='*', wrt='*', method='fd')
def compute(self, inputs, outputs):
p = inputs['p']
Vb = inputs['Vb']
Vl = inputs['Vl']
Vl_dot = inputs['Vl_dot']
gamma = inputs['gamma']
outputs['p_dot'] = gamma * p/(Vb - Vl) * Vl_dot
class VolumeFlowRate(om.ExplicitComponent):
"""
A Dymos ODE for a damped harmonic oscillator.
"""
def initialize(self):
self.options.declare('num_nodes', types=int)
def setup(self):
nn = self.options['num_nodes']
def setup(self):
# Inputs
self.add_input('p', desc='Pressure inside the nox_bottle', units='Pa')
self.add_input('pout', desc='Pressure outside the nox_bottle', units='Pa')
self.add_input('deltap', desc='Nox bottle pressure losses', units='Pa')
self.add_input('rhol', desc='Liquid density', units='kg/m**3')
self.add_input('Aout', desc='Output nox_bottle area', units='m**2')
# Outputs
self.add_output('Vl_dot', desc='Volume flow rate', units='m**3/s')
self.declare_partials(of='*', wrt='*', method='fd')
def compute(self, inputs, outputs):
p = inputs['p']
pout = inputs['pout']
deltap = inputs['deltap']
rhol = inputs['rhol']
Aout = inputs['Aout']
outputs['Vl_dot'] = Aout*np.sqrt(2/rhol*(p - pout - deltap))
class BottleModelODE(om.Group):
def initialize(self):
self.options.declare('num_nodes', types=int)
def setup(self):
nn = self.options['num_nodes']
self.add_subsystem('pressure_rate', subsys=PressureRate(num_nodes=nn),
promotes_inputs=['p', "Vb", "Vl", "Vl_dot", "gamma"], promotes_outputs=['p_dot'])
self.add_subsystem('volume_flow_rate', subsys=VolumeFlowRate(num_nodes=nn),
promotes_inputs=['p', "pout", 'deltap', 'rhol', "Aout"], promotes_outputs=['Vl_dot'])
self.connect('pressure_rate.p', 'volume_flow_rate.p')
self.connect('pressure_rate.Vl_dot', 'volume_flow_rate.Vl_dot')
Then to solve these equations and simulate my model, I build a program based on the oscillator example: https://openmdao.github.io/dymos/getting_started/intro_to_dymos/intro_segments.html
I am defining a phase called "explusion" by using the following function:
def expulsion_phase_fn(transcription: dm.transcriptions.pseudospectral.radau_pseudospectral.Radau, pamb: float):
phase = dm.Phase(ode_class=BottleModelODE, transcription=transcription)
phase.set_time_options(fix_initial=True, fix_duration=True)
phase.add_state('p', units='bar', rate_source='pressure_rate.p_dot',
targets=['pressure_rate.p', "volume_flow_rate.p"], fix_initial=True, fix_final=False, lower=pamb)
phase.add_state('Vl', units='m**3', rate_source='volume_flow_rate.Vl_dot', targets=['pressure_rate.Vl'],
fix_initial=True, fix_final=False, lower=0)
phase.add_parameter('Vb', targets=['pressure_rate.Vb'], units='m**3')
phase.add_parameter('gamma', targets=['pressure_rate.gamma'])
phase.add_parameter('rhol', targets=['volume_flow_rate.rhol'], units='kg/m**3')
phase.add_parameter('Aout', targets=['volume_flow_rate.Aout'], units='m**2')
phase.add_parameter('pout', targets=['volume_flow_rate.pout'], units="Pa")
phase.add_parameter('deltap', targets=['volume_flow_rate.deltap'], units="Pa")
return phase
Then, I am defining a trajectory with this function:
def trajectory(pamb: float):
transcript = dm.Radau(num_segments=50, solve_segments='forward')
traj = dm.Trajectory()
# Add phases to trajectory
expulsion_phase = traj.add_phase('expulsion',
expulsion_phase_fn(transcription=transcript, pamb=pamb))
return traj, expulsion_phase
And finally, I am setting the OpenMDAO problem, provide the initial values,... by doing the following lines, which are based on the Oscillator example:
def launch_compt():
# Set ambiant conditions
Tamb = 20 + 273.15
pamb = 100*10**3
deltap = 0
Vb = 5*10**-3
Aout = 10*10**-4
# Set NOX bottle properties up
bottle_params = {"Vb": 5*10**-3, "gamma": 1.4, "Aout": 3*10**-2, "rhol": 1000, "pout":
100*10**3, pinit": 300*10**3, "Vl": 1*10**-3}
# Instantiate an OpenMDAO Problem instance
prob = om.Problem(model=om.Group())
prob.driver = om.ScipyOptimizeDriver(optimizer='SLSQP')
# Instantiate a Dymos trjectory and add it to the Problem model
traj, phase = trajectory(pamb= 100*10*3)
phase.add_objective("time", loc="final")
# Setup the OpenMDAO problem
prob.model.add_subsystem("traj", traj)
prob.setup()
# Assign values to the times and states
prob.set_val('traj.explusion.t_initial', 0.0)
prob.set_val('traj.explusion.t_duration', 200.0)
prob.set_val('traj.explusion.states:p', bottle_params["pinit"])
prob.set_val('traj.explusion.states:Vl', bottle_params["Vl"])
prob.set_val('traj.explusion.parameters:Vb', bottle_params["Vb"])
prob.set_val('traj.explusion.parameters:gamma', bottle_params["gamma"])
prob.set_val('traj.explusion.parameters:rhol', bottle_params["rhol"])
prob.set_val('traj.explusion.parameters:Aout', bottle_params["Aout"])
prob.set_val('traj.explusion.parameters:pout', bottle_params["pout"])
prob.set_val('traj.explusion.parameters:deltap', bottle_params["deltap"])
prob.run_driver()
Unofortunately, that does not work I cannot understand why. It returns me that the parameter Vb (Bottle total volume) is not provided but I cannot understand why: it is provided when I am adding the parameters to the problem, like within the Oscillator example.
In that respect I am contacting, in the hope to find some help.
Thank in advance for any answer.
PS: Here is the error message that I get when I am trying to run the program:
raise ValueError(f'Invalid parameter in phase `{self.pathname}`.\n{str(e)}') from e
ValueError: Invalid parameter in phase `traj.phases.expulsion`.
Parameter `Vb` has invalid target(s).
No such ODE input: 'pressure_rate.Vb'.
The primary issue that you have asked about, related to the No such ODE input error, is cased by the way you coded your ODE and more specifically the way you promoted variables and then added the ODE to the phase.
For example, you promoted your input P then set the state target to pressure_rate.P. This is incorrect. When you promoted P that effectively moved the name address up to the top level of the ODE, so the name target is just P now. You can read more about promotion vs connection in the docs. You have this problem in most of your script, where you are not accounting for promotion when you set targets.
Unfortunately, this is not the only issue in your script. There are several more, and enough that I am not able to get things fully working.
Here are some other modest issues in rough order of importance:
The VolumeFlowRate component input and outputs are scalar, but seem to be intended to connect to the vector (of size num_nodes) variables of PressureRate. I suspect you meant to make them vector as well, but am not 100% sure
You have an execution order issue between PressureRate and VolumeRate. Pressure rate seems to need as an input Vl_dot, which comes from VolumeRate`, but you have added it first so it will run BEFORE the component providing its input value.
You had a typo in your set_val calls (explusion vs expulsion)
You did not have a deltap key in the parameter diction, but you did have a variable for it.
After fixing those, I could get the problem to start running but it did not converge or give an answer. You had solve_segments set to forward and had set 50 segments. Both of those seemed like bad settings to me, so I changed them to 3 segments, and removed the solve_segments option.
Then I was able to get the optimizer to take a few steps, but it errored with
Current function value: [200.]
Iterations: 6
Function evaluations: 12
Gradient evaluations: 2
Optimization FAILED.
Positive directional derivative for linesearch
Which indicated a problem with the derivatives. So I changed your setting for partial derivatives from fd to cs. That allowed it to iterate more, but still didn't converge. Without diving more into the physics of your problem I can't easily diagnose this further. I suspect you have some bad boundary conditions and probably bad initial guesses though.
Here is the modified script I came up with to at least get the optimizer iterating.
import numpy as np
import openmdao.api as om
import dymos as dm
class PressureRate(om.ExplicitComponent):
def initialize(self):
self.options.declare('num_nodes', types=int)
def setup(self):
nn = self.options['num_nodes']
# Inputs
self.add_input('p', shape=(nn,), desc='Pressure inside the nox bottle', units='Pa')
self.add_input('Vb', shape=(nn,), desc='Bottle volume', units='m**3')
self.add_input('Vl', shape=(nn,), desc='Liquid volume', units='m**3')
self.add_input('Vl_dot', shape=(nn,), desc='Liquid volume flow rate', units='m**3/s')
self.add_input('gamma', shape=(nn,), desc='Heat capacity ratio')
# Outputs
self.add_output('p_dot', val=np.ones(nn), desc='Pressure change rate', units='Pa/s')
self.declare_partials(of='*', wrt='*', method='cs')
def compute(self, inputs, outputs):
p = inputs['p']
Vb = inputs['Vb']
Vl = inputs['Vl']
Vl_dot = inputs['Vl_dot']
gamma = inputs['gamma']
outputs['p_dot'] = gamma * p/(Vb - Vl) * Vl_dot
class VolumeFlowRate(om.ExplicitComponent):
"""
A Dymos ODE for a damped harmonic oscillator.
"""
def initialize(self):
self.options.declare('num_nodes', types=int)
def setup(self):
nn = self.options['num_nodes']
# Inputs
self.add_input('p', shape=(nn,), desc='Pressure inside the nox_bottle', units='Pa')
self.add_input('pout', shape=(nn,), desc='Pressure outside the nox_bottle', units='Pa')
self.add_input('deltap', shape=(nn,), desc='Nox bottle pressure losses', units='Pa')
self.add_input('rhol', shape=(nn,), desc='Liquid density', units='kg/m**3')
self.add_input('Aout', shape=(nn,), desc='Output nox_bottle area', units='m**2')
# Outputs
self.add_output('Vl_dot', shape=(nn,), desc='Volume flow rate', units='m**3/s')
self.declare_partials(of='*', wrt='*', method='cs')
def compute(self, inputs, outputs):
p = inputs['p']
pout = inputs['pout']
deltap = inputs['deltap']
rhol = inputs['rhol']
Aout = inputs['Aout']
outputs['Vl_dot'] = Aout*np.sqrt(2/rhol*(p - pout - deltap))
class BottleModelODE(om.Group):
def initialize(self):
self.options.declare('num_nodes', types=int)
def setup(self):
nn = self.options['num_nodes']
self.add_subsystem('volume_flow_rate', subsys=VolumeFlowRate(num_nodes=nn),
promotes_inputs=['p', "pout", 'deltap', 'rhol', "Aout"], promotes_outputs=['Vl_dot'])
self.add_subsystem('pressure_rate', subsys=PressureRate(num_nodes=nn),
promotes_inputs=['p', "Vb", "Vl", "Vl_dot", "gamma"], promotes_outputs=['p_dot'])
def expulsion_phase_fn(transcription: dm.transcriptions.pseudospectral.radau_pseudospectral.Radau, pamb: float):
phase = dm.Phase(ode_class=BottleModelODE, transcription=transcription)
phase.set_time_options(fix_initial=True, fix_duration=True)
phase.add_state('p', units='bar', rate_source='p_dot',
targets=['p'], fix_initial=True, fix_final=False, lower=pamb)
phase.add_state('Vl', units='m**3', rate_source='Vl_dot', targets=['Vl'],
fix_initial=True, fix_final=False, lower=0)
phase.add_parameter('Vb', targets=['Vb'], units='m**3')
phase.add_parameter('gamma', targets=['gamma'])
phase.add_parameter('rhol', targets=['rhol'], units='kg/m**3')
phase.add_parameter('Aout', targets=['Aout'], units='m**2')
phase.add_parameter('pout', targets=['pout'], units="Pa")
phase.add_parameter('deltap', targets=['deltap'], units="Pa")
return phase
def trajectory(pamb: float):
# transcript = dm.Radau(num_segments=50, solve_segments='forward')
transcript = dm.Radau(num_segments=3)
traj = dm.Trajectory()
# Add phases to trajectory
expulsion_phase = traj.add_phase('expulsion', expulsion_phase_fn(transcription=transcript, pamb=pamb))
return traj, expulsion_phase
if __name__ == "__main__":
# Set ambiant conditions
Tamb = 20 + 273.15
pamb = 100*10**3
deltap = 0
Vb = 5*10**-3
Aout = 10*10**-4
# Set NOX bottle properties up
bottle_params = {"Vb": 5*10**-3, "gamma": 1.4, "Aout": 3*10**-2, "rhol": 1000, "pout": 100*10**3, "pinit": 300*10**3, "Vl": 1*10**-3}
# Instantiate an OpenMDAO Problem instance
prob = om.Problem(model=om.Group())
prob.driver = om.ScipyOptimizeDriver(optimizer='SLSQP')
# Instantiate a Dymos trjectory and add it to the Problem model
traj, phase = trajectory(pamb=100*10*3)
phase.add_objective("time", loc="final")
# Setup the OpenMDAO problem
prob.model.add_subsystem("traj", traj)
prob.setup()
# Assign values to the times and states
prob.set_val('traj.expulsion.t_initial', 0.0)
prob.set_val('traj.expulsion.t_duration', 200.0)
prob.set_val('traj.expulsion.states:p', bottle_params["pinit"])
prob.set_val('traj.expulsion.states:Vl', bottle_params["Vl"])
prob.set_val('traj.expulsion.parameters:Vb', bottle_params["Vb"])
prob.set_val('traj.expulsion.parameters:gamma', bottle_params["gamma"])
prob.set_val('traj.expulsion.parameters:rhol', bottle_params["rhol"])
prob.set_val('traj.expulsion.parameters:Aout', bottle_params["Aout"])
prob.set_val('traj.expulsion.parameters:pout', bottle_params["pout"])
prob.set_val('traj.expulsion.parameters:deltap', deltap)
prob.run_driver()
I am currently using an MPC to have the TCLab heater reach a certain set point temperature. I am trying to have an MHE update certain parameter values every 50 seconds. I have a previous MPC model that worked amazing and I tried to add a part in my main loop that has it switch to improve certain values and then switch back into MPC mode. I have seen that other people doing this same problem have made a gekko class for the MPC and also for the MHE and then had them work together, but is there a way that I can add a part in my current MPC loop that will allow the MHE to update certain values and then switch back into MPC?
Here is the Code that I added into my loop to have it update the variables but it isn't updating my values
if i%50 == 0 or i == 0:
m.options.IMODE = 5
Q1.STATUS = 0
Q1.FSTATUS = 1
Q2.STATUS = 0
Q2.FSTATUS = 1
U.FSTATUS = 1
α1.FSTATUS = 1
α2.FSTATUS = 1
τ.FSTATUS = 1
m.solve(disp = False)
Q1.STATUS = 1
Q1.FSTATUS = 1
Q2.STATUS = 1
Q2.FSTATUS = 1
m.options.IMODE = 6
U.FSTATUS = 0
α1.FSTATUS = 0
α2.FSTATUS = 0
τ.FSTATUS = 0
Gekko facilitates transfer of information between MHE and MPC but combining them into a single application isn't a current feature. Warmstart files est.t0 (MHE) and ctl.t0 (MPC) store the prior solution and use it to initialize the next solve. A file est.xfer (MHE) is a transfer file to update the initial conditions and parameters from the MHE application. You can look at these files by opening the run folder if remote=False (local solve):
mhe.open_folder()
mpc.open_folder()
Why a Single Application is Challenging
Gekko also uses a CSV file to transfer values and update the application before the next m.solve() command. Each variable x in Gekko only has one x.value. If you have an MHE and MPC application, you would need to manage how the x.value and all the options are reloaded for all of the variables before each of the m.solve() commands. It would be very tedious to manage that in a script, even with a deepcopy() function.
Create MHE and MPC Models in a Loop
An easier way is to create two separate models that are used for MHE and MPC. To facilitate this, the model can be built in a loop (see complete example) so that variables and equations are only defined once.
# initialize MHE and MPC
mhe = GEKKO(name='tclab-mhe')
mpc = GEKKO(name='tclab-mpc')
# create 2 models (MHE and MPC) in loop
for m in [mhe,mpc]:
# Adjustable Parameters
# heat transfer (W/m2-K)
m.U = m.FV(value=2.76,lb=1.0,ub=5.0)
# Semi-fundamental correlations (energy balances)
m.Equation(mass*Cp*m.TH1.dt() == m.U*A*(m.TaK-m.T1i) \
+ eps * sigma * A * (m.TaK**4 - m.T1i**4) \
+ m.Q_C12 + m.Q_R12 \
+ m.alpha1 * m.Q1)
# Empirical correlations (lag equations to emulate conduction)
m.Equation(m.tau * m.TC1.dt() == -m.TC1 + m.TH1)
After the equations are defined, the MHE and MPC applications can be configured with various options that are specific to that mode.
Application Specific Configuration
# ------------------------------
# Configure MHE
mhe.time = np.linspace(0,120,31)
mhe.options.IMODE = 5 # MHE
# FV tuning
mhe.U.STATUS = 1
mhe.Ta.STATUS = 0
# ------------------------------
# Configure MPC
mpc.time = [0,4,8,12,15,20,25,30,35,40,50,60,70,80,90]
# FV tuning
mpc.U.STATUS = 0
mpc.Ta.STATUS = 0
mpc.U.FSTATUS = 1
mpc.Ta.FSTATUS = 1
# Global Options
mpc.options.IMODE = 6 # MPC
If you'd like to easily transfer values between the MHE and MPC applications at every cycle then one option is to copy the est.xfer file from the MHE folder mhe.path into the MPC folder mpc.path. This will use the updated states and parameters from the MHE application in the MPC application.
After some research, I was able to predict the future value using the LSTM code below. I have also attached the Dmd1ahr.csv file in the github link that I am using.
https://github.com/ukeshchawal/hello-world/blob/master/Dmd1ahr.csv
As you all can see below, 90 data points are training sets and 91st to 100th are future value prediction.
However some of the questions that I still have are:
In order to predict these values I had to originally take more than hundred data sets (here, I have taken 500 data sets) which is not exactly what my primary goal is. Is there a way that given 500 data sets, it will predict the rest 10 or 20 out of sample data points? If yes, will you please write me a sample code where you can just take 500 data points from Dmd1ahr.csv file attached below and it will predict some future values (say 501 to 520) based on those 500 points?
The prediction are way off compared to the one who have in your blogs (definitely indicates for parameter tuning - I tried changing epochs, LSTM layers, Activation, optimizer). What other parameter tuning I can do to make it more robust?
Thank you'll in advance.
import numpy as np
import matplotlib.pyplot as plt
import pandas
# By twaking the architecture it could be made more robust
np.random.seed(7)
numOfSamples = 500
lengthTrain = 90
lengthValidation = 100
look_back = 1 # Can be set higher, in my experiments it made performance worse though
transientTime = 90 # Time to "burn in" time series
series = pandas.read_csv('Dmd1ahr.csv')
def generateTrainData(series, i, look_back):
return series[i:look_back+i+1]
trainX = np.stack([generateTrainData(series, i, look_back) for i in range(lengthTrain)])
testX = np.stack([generateTrainData(series, lengthTrain + i, look_back) for i in range(lengthValidation)])
trainX = trainX.reshape((lengthTrain,look_back+1,1))
testX = testX.reshape((lengthValidation, look_back + 1, 1))
trainY = trainX[:,1:,:]
trainX = trainX[:,:-1,:]
testY = testX[:,1:,:]
testX = testX[:,:-1,:]
############### Build Model ###############
import keras
from keras.models import Model
from keras import layers
from keras import regularizers
inputs = layers.Input(batch_shape=(1,look_back,1), name="main_input")
inputsAux = layers.Input(batch_shape=(1,look_back,1), name="aux_input")
# this layer makes the actual prediction, i.e. decides if and how much it goes up or down
x = layers.recurrent.LSTM(300,return_sequences=True, stateful=True)(inputs)
x = layers.recurrent.LSTM(200,return_sequences=True, stateful=True)(inputs)
x = layers.recurrent.LSTM(100,return_sequences=True, stateful=True)(inputs)
x = layers.recurrent.LSTM(50,return_sequences=True, stateful=True)(inputs)
x = layers.wrappers.TimeDistributed(layers.Dense(1, activation="linear",
kernel_regularizer=regularizers.l2(0.005),
activity_regularizer=regularizers.l1(0.005)))(x)
# auxillary input, the current input will be feed directly to the output
# this way the prediction from the step before will be used as a "base", and the Network just have to
# learn if it goes a little up or down
auxX = layers.wrappers.TimeDistributed(layers.Dense(1,
kernel_initializer=keras.initializers.Constant(value=1),
bias_initializer='zeros',
input_shape=(1,1), activation="linear", trainable=False
))(inputsAux)
outputs = layers.add([x, auxX], name="main_output")
model = Model(inputs=[inputs, inputsAux], outputs=outputs)
model.compile(optimizer='adam',
loss='mean_squared_error',
metrics=['mean_squared_error'])
#model.summary()
#model.fit({"main_input": trainX, "aux_input": trainX[look_back-1,look_back,:]},{"main_output": trainY}, epochs=4, batch_size=1, shuffle=False)
model.fit({"main_input": trainX, "aux_input": trainX[:,look_back-1,:].reshape(lengthTrain,1,1)},{"main_output": trainY}, epochs=100, batch_size=1, shuffle=False)
############### make predictions ###############
burnedInPredictions = np.zeros(transientTime)
testPredictions = np.zeros(len(testX))
# burn series in, here use first transitionTime number of samples from test data
for i in range(transientTime):
prediction = model.predict([np.array(testX[i, :, 0].reshape(1, look_back, 1)), np.array(testX[i, look_back - 1, 0].reshape(1, 1, 1))])
testPredictions[i] = prediction[0,0,0]
burnedInPredictions[:] = testPredictions[:transientTime]
# prediction, now dont use any previous data whatsoever anymore, network just has to run on its own output
for i in range(transientTime, len(testX)):
prediction = model.predict([prediction, prediction])
testPredictions[i] = prediction[0,0,0]
# for plotting reasons
testPredictions[:np.size(burnedInPredictions)-1] = np.nan
############### plot results ###############
#import matplotlib.pyplot as plt
plt.plot(testX[:, 0, 0])
plt.show()
plt.plot(burnedInPredictions, label = "training")
plt.plot(testPredictions, label = "prediction")
plt.legend()
plt.show()
I have a pandas multilevel dataframe df to contain the quarterly financial report data for about 2000+ stocks from year 2006 to 2012 . And I am trying to figure out a way to quickly calculate the 'average' values for each data point.
demo_data() is the function to generate the demo data (df = demo_data(stk_qty=2000, col_num=200) can be used to simulate the financial report data):
def demo_data(stk_qty, col_num):
''' generate demo data, return multilevel dataframe '''
import random
import pandas as pd
rpt_date_template = [(yr+qt) for yr in map(str, range(2006, 2013)) for qt in ['0331','0630','0930','1231']]
stk_id_list = ['STK'+str(x).zfill(3) for x in range(0, stk_qty)]
stk_id_column, rpt_date_column = [], []
for i in range(stk_qty):
stk_rpt_date_list = rpt_date_template[random.randint(0,8):] # rpt dates with random start
stk_id_column.extend([stk_id_list[i]] * len(stk_rpt_date_list))
rpt_date_column.extend(stk_rpt_date_list)
index_name = ['STK_ID', 'RPT_Date']
col_name = ['COL'+str(x).zfill(3) for x in range(col_num)]
first_level_dt = stk_id_column
second_level_dt = rpt_date_column
dt = pd.DataFrame(np.random.randn(len(stk_id_column), col_num), columns=col_name)
dt[index_name[0]] = first_level_dt
dt[index_name[1]] = second_level_dt
multilevel_df = dt.set_index(index_name, drop=True, inplace=False)
return multilevel_df
Here is a sample data. (note: sw() is a method to display the four corners data of a big dataframe, source code is at: How to preview a part of a large pandas DataFrame? )
>>> df = demo_data(5,3)
>>> df.sw()
COL000 COL001 COL002
STK_ID RPT_Date
STK000 20060630 1.8196 0.9519 -1.0526
20060930 -0.4074 -0.9025 1.3562
20061231 -1.1750 0.4190 -1.2976
20070331 -0.5609 1.5190 0.4893
20070630 0.4580 -0.3804 0.3705
20070930 -0.4711 -1.1953 -0.0609
20071231 0.3363 1.1949 1.2802
20080331 1.6359 0.8355 -0.2763
20080630 0.2697 -0.8236 -1.7095
20080930 0.6178 -0.3742 -1.1646
.......................................
STK004 20111231 -0.3198 1.6972 -1.3281
20120331 -1.1905 -0.4597 0.3695
20120630 -0.8253 -0.0502 -0.2862
20120930 0.0059 -1.8535 -1.2107
20121231 0.5762 -0.2872 0.0993
Index : ['STK_ID', 'RPT_Date']
Column: COL000,COL001,COL002
row: 117 col: 3
The customized average function I want is named as my_avg() and defined as below rules:
1. Q1's average value is (Q4_of_previous_yr + Q1)/2
2. Q2's average value is (Q4_of_previous_yr + Q1 + Q2)/3
3. Q3's average value is (Q4_of_previous_yr + Q1 + Q2 + Q3)/4
4. Q4's average value is (Q4_of_previous_yr + Q1 + Q2 + Q3 + Q4)/5
5. if some of the data points are not provided, just calculate the normal average of available data points
so the my_avg(df) will have below output for each STK_ID:
STK_ID RPT_Date COL000 COL001 COL002
STK000 20060630 1.819619705 0.951918984 -1.052639309
20060930 0.706112476 0.024688028 0.151757352
20061231 0.079077767 0.156125083 -0.331359614
20070331 -0.867930112 0.969000466 -0.404129827
20070630 -0.425943376 0.519205768 -0.145929753
20070930 -0.437234418 0.090579744 -0.124681449
20071231 -0.282524858 0.3114374 0.156297097
20080331 0.986121631 1.015202552 0.501971496
.......................................
STK004 20111231 xxxxx xxxxxxx xxxxxxx
How to write the code for my_avg() ?
Reference:
I try to write a temp_solution_avg() function. But it has three issues:
1. the average calculation not include 'Q4_of_previous_yr' data point, so the result is not what I want.
2. data's 'RPT_Date' must start with Q1 ('xxxx0331'), otherwise first yr's data is wrong
3. the calculation speed is very very slow.
In [3]: df = demo_data(500,100)
In [4]: timeit temp_solution_avg(df)
1 loops, best of 3: 66.3 s per loop
def temp_solution_avg(df):
''' return the average , Q1: not change, Q2 : (df.Q1 + df.Q2)/2 ,
Q3: (df.Q1 + df.Q2 + df.Q3)/3, Q4 : (df.Q1 + df.Q2 + df.Q3 + df.Q4)/4
data's 'RPT_Date' must start with Q1 ('xxxx0331'), otherwise first yr's
data is wrong .
'''
dt = df.reset_index()
dt['yr'] = dt['RPT_Date'].str[0:4]
dt['temp_stk_id'] = dt['STK_ID']
dt = dt.set_index(['STK_ID','RPT_Date'], drop=True, inplace=False)
rst = dt.groupby(['temp_stk_id','yr']).transform(pd.expanding_mean)
return rst