Is there a way to plot confusion matrix from H2O? - h2o

I know H2O can use
model_perf = model.model_performance(input)
model_perf.confusion_matrix
to output the confusion matrix. But is there a way to get the confusion matrix table to create plot?

You have the function you need as indicated here. So you just need to convert the output of your H2OFrames to a Pandas Dataframe. Example is shown below:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
%matplotlib inline
h2o.init()
h2o.cluster().show_status()
# import the cars dataset:
# this dataset is used to classify whether or not a car is economical based on
# the car's displacement, power, weight, and acceleration, and the year it was made
cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
# print(cars["economy_20mpg"].isna().sum())
cars[~cars["economy_20mpg"].isna()]["economy_20mpg"].isna().sum()
cars = cars[~cars["economy_20mpg"].isna()]
# convert response column to a factor
cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
# set the predictor names and the response column name
predictors = ["displacement","power","weight","acceleration","year"]
response = "economy_20mpg"
# split into train and validation sets
train, valid = cars.split_frame(ratios = [.8], seed = 1234)
# try using the `y` parameter:
# first initialize your estimator
cars_gbm = H2OGradientBoostingEstimator(seed = 1234, sample_rate=.5)
# then train your model, where you specify your 'x' predictors, your 'y' the response column
# training_frame and validation_frame
cars_gbm.train(x = predictors, y = response, training_frame = train, validation_frame = valid)
function from sklearn:
def plot_confusion_matrix(y_true, y_pred, classes,
normalize=False,
title=None,
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if not title:
if normalize:
title = 'Normalized confusion matrix'
else:
title = 'Confusion matrix, without normalization'
# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Only use the labels that appear in the data
classes = classes[unique_labels(y_true, y_pred)]
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes, yticklabels=classes,
title=title,
ylabel='True label',
xlabel='Predicted label')
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(j, i, format(cm[i, j], fmt),
ha="center", va="center",
color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
return ax
extract values
# specify the threshold you want to use to create integer labels
maxf1_threshold = cars_gbm.find_threshold_by_max_metric('f1')
# specify your tru and prediciton labels
y_true = cars["economy_20mpg"].as_data_frame()
y_pred = cars_gbm.predict(cars)
# convert prediction labels (original uncalibrated probabilities into integer labels)
y_pred = (y_pred['p1'] >= maxf1_threshold).ifelse(1,0)
y_pred = y_pred.as_data_frame()
y_pred.columns = ['p1']
y_true1 = y_true.economy_20mpg
y_pred1 = y_pred.p1
class_names = np.array(cars["economy_20mpg"].levels()[0])
# Plot non-normalized confusion matrix
plot_confusion_matrix(y_true1, y_pred1, classes=class_names,
title='Confusion matrix')
image result:
Please note that there is a bug in the H2O-3 confusion matrix that has been noted here

Related

How does skimage.segmentation.slic achieve segmentation under non-binary masks?

Slic can implement segmentation under binarized masks, as shown in the figure below
from https://scikit-image.org/docs/dev/auto_examples/segmentation/plot_mask_slic.html
But if I need to divide the superpixels of different adjacent regions, what should I do?
Each color represents an area, each region requires independent superpixel segmentation
There is not currently any way to handle a mask with multiple regions in a single call. For your use case you will have to split each region into a separate mask and then call slic once per mask. You can combine the multiple segmentations into one by incrementing the labels appropriately.
Pasted below is a concrete example of this for two separate masked regions (adapted from the existing example you referenced):
import matplotlib.pyplot as plt
import numpy as np
from skimage import data
from skimage import color
from skimage import morphology
from skimage import segmentation
# Input data
img = data.immunohistochemistry()
# Compute a mask
lum = color.rgb2gray(img)
mask = morphology.remove_small_holes(
morphology.remove_small_objects(
lum < 0.7, 500),
500)
mask1 = morphology.opening(mask, morphology.disk(3))
# create a second mask as the inverse of the first
mask2 = ~mask1
segmented = np.zeros(img.shape[:-1], dtype=np.int64)
max_label = 0
# replace [mask2, mask1] with a list of any number of binary masks
for mask in [mask2, mask1]:
# maskSLIC result
m_slic = segmentation.slic(img, n_segments=100, mask=mask, start_label=1)
if max_label > 0:
# offset the labels by the current maximum label
m_slic += max_label
# add the label into the current combined segmentation
segmented += m_slic
# increment max label
max_label += m_slic.max()
# Display result
fig, ax_arr = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(10, 10))
ax1, ax2, ax3, ax4 = ax_arr.ravel()
ax1.imshow(img)
ax1.set_title('Original image')
ax2.imshow(mask, cmap='gray')
ax2.set_title('Mask')
ax3.imshow(segmentation.mark_boundaries(img, m_slic))
ax3.contour(mask, colors='red', linewidths=1)
ax3.set_title('maskSLIC (mask1 only)')
ax4.imshow(segmentation.mark_boundaries(img, segmented))
ax4.contour(mask, colors='red', linewidths=1)
ax4.set_title('maskSLIC (both masks)')
for ax in ax_arr.ravel():
ax.set_axis_off()
plt.tight_layout()
plt.show()
The basic approach I am suggesting is in the for loop above. Most of the other code is just generating the data and plots.

How to use a vector set point with mpc inorder to give program information on how the future set point will change

I am using an MPC to run a heater system. Currently I have it take an individual value from my set point array at a given point in time to adjust the process to reach. I would like to be able to give it the current desired value and a couple of points in the future for the set point so that it can better adjust as the set point changes. How can I give gekko a vector in order to have it better adjust to future set points?
this is the part of my code that currently updates my setpoint values.
T1[i] = a.T1
T2[i] = a.T2
TC1.MEAS = T1[i]
TC2.MEAS = T2[i]
DT = .1
TC1.SPHI = sp1[i] + DT #sp1 and sp2 are set point arrays for the two heaters
TC1.SPLO = sp1[i] - DT
TC2.SPHI = sp2[i] + DT
TC2.SPLO = sp2[i] - DT
m.solve(disp=False)
The gekko CV object only uses a scalar value of an array for SP, SPHI, and SPLO so some modification is needed to have the optimizer consider future setpoint changes. A simple MPC application shows how setpoints are used in Gekko.
from gekko import GEKKO
import numpy as np
import matplotlib.pyplot as plt
m = GEKKO()
m.time = np.linspace(0,20,41)
p = m.MV(value=0, lb=0, ub=100) # Declare MV
p.STATUS = 1 # allow optimizer to change
p.DCOST = 0.1 # smooth MV response
p.DMAX = 10.0 # max move each cycle
v = m.CV(value=0) # Declare CV
v.STATUS = 1 # add CV to the objective
m.options.CV_TYPE = 2 # squared error
v.SP = 40 # set point
v.TR_INIT = 1 # set point trajectory
v.TAU = 5 # time constant of trajectory
m.Equation(10*v.dt() == -v + 2*p)
m.options.IMODE = 6 # control
m.solve(disp=False)
# get additional solution information
import json
with open(m.path+'//results.json') as f:
results = json.load(f)
plt.figure()
plt.subplot(2,1,1)
plt.plot(m.time,p.value,'b-',label='MV Optimized')
plt.legend()
plt.ylabel('Input')
plt.subplot(2,1,2)
plt.plot(m.time,results['v1.tr'],'k-',label='Reference Trajectory')
plt.plot(m.time,v.value,'r--',label='CV Response')
plt.ylabel('Output')
plt.xlabel('Time')
plt.legend(loc='best')
plt.show()
There is one setpoint value that is a target of 40 and a reference trajectory is defined with a time constant of 5. The MPC is not able to follow the reference trajectory exactly because of a rate of change constraint with DMAX=10. There are two options if you'd like the optimizer to know about future setpoint changes.
Option 1: Don't use CV, Use Feedforward Parameter
If you don't need the reference trajectory and are okay with a squared error objective then then easiest method to project future setpoint changes is to define your own MPC objective with a feedforward parameter vector of setpoint values. The example problem shows that the optimizer is anticipating the setpoint change and proactively moves before the next setpoint change to minimize the overall sum of squared error. This may be desirable in many circumstances but may be undesirable in manufacturing where there are product grade changes and the end of the production campaign should be in-spec before producing transition material.
from gekko import GEKKO
import numpy as np
import matplotlib.pyplot as plt
m = GEKKO()
m.time = np.linspace(0,20,41)
p = m.MV(value=0, lb=0, ub=100) # Declare MV
p.STATUS = 1 # allow optimizer to change
p.DCOST = 0.1 # smooth MV response
p.DMAX = 10.0 # max move constraint
v = m.Var(value=0)
sp = np.ones(41)*40
sp[20:] = 60
s = m.Param(value=sp)
m.Obj((s-v)**2)
m.Equation(10*v.dt() == -v + 2*p)
m.options.IMODE = 6 # control
m.solve(disp=False)
plt.figure()
plt.subplot(2,1,1)
plt.plot(m.time,p.value,'b-',label='MV Optimized')
plt.legend()
plt.ylabel('Input')
plt.subplot(2,1,2)
plt.plot(m.time,sp,'k-',label='Setpoint')
plt.plot(m.time,v.value,'r--',label='CV Response')
plt.ylabel('Output')
plt.xlabel('Time')
plt.legend(loc='best')
plt.show()
Option 2: Error as CV
If it is desirable to use the reference trajectory and Gekko built-in CV options, then an option is to define a new error variable e and control that instead. The error variable always has a setpoint of zero and the feedforward setpoint is implemented as a feedforward parameter.
from gekko import GEKKO
import numpy as np
import matplotlib.pyplot as plt
m = GEKKO()
m.time = np.linspace(0,20,41)
p = m.MV(value=0, lb=0, ub=100) # Declare MV
p.STATUS = 1 # allow optimizer to change
p.DCOST = 0.1 # smooth MV response
p.DMAX = 10.0 # max move constraint
v = m.Var(value=0)
sp = np.ones(41)*40
sp[20:] = 60
s = m.Param(value=sp)
e = m.CV(value=0) # Declare CV
e.STATUS = 1 # add CV to the objective
m.options.CV_TYPE = 2 # squared error
e.SP = 0 # set point
e.TR_INIT = 1 # error trajectory
e.TAU = 5 # time constant of trajectory
m.Equation(e==s-v)
m.Equation(10*v.dt() == -v + 2*p)
m.options.IMODE = 6 # control
m.solve(disp=False)
plt.figure()
plt.subplot(2,1,1)
plt.plot(m.time,p.value,'b-',label='MV Optimized')
plt.legend()
plt.ylabel('Input')
plt.subplot(2,1,2)
plt.plot(m.time,sp,'k-',label='Setpoint')
plt.plot(m.time,v.value,'r--',label='CV Response')
plt.ylabel('Output')
plt.xlabel('Time')
plt.legend(loc='best')
plt.show()
For every time step, Gekko automatically generates the setpoint in the form of the array for the future control horizon. And, the arrays are usually filled with the single value that you assigned. However, you can give the setpoint as an array as shown below.
sp1 = np.array([[1,2,3,4,5],
[2,3,4,5,6],
[3,4,5,6,7],
[4,5,6,7,8]])
Then, you can assign the every row of your matrix for each time step just as you did in your question.
DT = .1
TC1.SPHI = sp1[i] + DT
Note:
You need to have the same length of setpoint array which means the size of the column in your setpoint matrix with the control horizon (e.g. 'm.time').
You might want to set the setpoint trajectory option '0' if you don't want to filter out your setpoint sequence in your array. (TR_INIT = 0)

Is it possible to get the data of biased and unbiased predicted controlled variables when using APMonitor for Model Predictive Control in Python?

I am trying to build a python code for Model Predictive Control using APMonitor. However, I don't want to get the results on an third party online server. Hence, I want to collect the data of the predicted biased and unbiased and plot them on Python myself.
Try this in Python Gekko:
# get additional solution information
import json
with open(m.path+'//results.json') as f:
results = json.load(f)
You can get the unbiased model result by getting the dictionary value of your variable v with v.name. You can get the biased model prediction with v.name+'.bcv'. Here is an example that also shows how to get the raw trajectory information.
This gives you access to the raw data. An example shows how to plot from the JSON data.
from gekko import GEKKO
import numpy as np
import matplotlib.pyplot as plt
m = GEKKO()
m.time = np.linspace(0,20,41)
# Parameters
mass = 500
b = m.Param(value=50)
K = m.Param(value=0.8)
# Manipulated variable
p = m.MV(value=0, lb=0, ub=100)
p.STATUS = 1 # allow optimizer to change
p.DCOST = 0.1 # smooth out gas pedal movement
p.DMAX = 20 # slow down change of gas pedal
# Controlled Variable
v = m.CV(value=0)
v.STATUS = 1 # add the SP to the objective
m.options.CV_TYPE = 2 # squared error
v.SP = 40 # set point
v.TR_INIT = 1 # set point trajectory
v.TAU = 5 # time constant of trajectory
# Process model
m.Equation(mass*v.dt() == -v*b + K*b*p)
m.options.IMODE = 6 # control
m.solve(disp=False,GUI=True)
# get additional solution information
import json
with open(m.path+'//results.json') as f:
results = json.load(f)
plt.figure()
plt.subplot(2,1,1)
plt.plot(m.time,p.value,'b-',label='MV Optimized')
plt.legend()
plt.ylabel('Input')
plt.subplot(2,1,2)
plt.plot(m.time,results['v1.tr'],'k-',label='Reference Trajectory')
plt.plot(m.time,v.value,'r--',label='CV Response')
plt.ylabel('Output')
plt.xlabel('Time')
plt.legend(loc='best')
plt.show()

How does statsmodels ARIMA.forecast() work?

I simulated an ARMA Process and tried to forecast it with statsmodels.
I plotted the true value and the forecasted values.
I read that out-of-sample forecasts tend to converge to the sample mean for a long forecasting period. Can someone describe how this forecasts are calculated? I read in the documentation that they transform the arma model into a state space model and then forecast the next value via kalman filter, is this correct? Is then the calculated value t+1 used for the next prediction t+2?
However if I created another AR process and shifted it upwards
and mix the two arrays and then again do a forecast.
How does statsmodels ARIMA forecast "learn" the ups and downs of the data?
import matplotlib.pyplot as plt
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.arima_model import ARIMA
ar = np.array([1, -0.9])
ma = np.array([1])
AR_object = ArmaProcess(ar, ma)
a = AR_object.generate_sample(nsample=200)
b = AR_object.generate_sample(nsample=200)+10
ab = [item for sublist in zip(a, b) for item in sublist]
train = a[:100]
test = a[100:]
train_ab = ab[:100]
test_ab = ab[100:200]
plt.plot(a)
plt.plot(b)
model = ARIMA(train, order=(1,0,1))
model_fit = model.fit()
forecast = model_fit.forecast(steps=len(test))[0]
plt.figure()
plt.plot(test, color= 'green')
plt.plot(forecast, color='red')
plt.figure()
model = ARIMA(train_ab, order=(1,0,1))
model_fit = model.fit()
forecast_ab = model_fit.forecast(steps=len(test_ab))[0]
plt.plot(test_ab, color= 'green')
plt.plot(forecast_ab, color='red')

How do I perform a curve fit with an array of points and touching a specific point in that array

I need help with curve fitting a given set of points. The points form a parabola and I ought to find the peak point of the result. Issue is when I do a curve fit, it sometimes doesn't touch the max y-coordinate even if the actual point is given in the input array.
Following is the code snippet. Here 1.88 is the actual peak y-coordinate (13.05,1.88). But the graph generated by the code does not touch the point due to curve fitting. So is there a way to fit the curve making sure that it touches the max point given in the input array?
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit, minimize_scalar
fig = plt.gcf()
#fig.set_size_inches(18.5, 10.5)
x = [4.59,9.02,13.05,18.47,20.3]
y = [1.7,1.84,1.88,1.7,1.64]
def f(x, p1, p2, p3):
return p3*(p1/((x-p2)**2 + (p1/2)**2))
plt.plot(x,y,"ro")
popt, pcov = curve_fit(f, x, y)
# find the peak
fm = lambda x: -f(x, *popt)
r = minimize_scalar(fm, bounds=(1, 5))
print( "maximum:", r["x"], f(r["x"], *popt) ) #maximum: 2.99846874275 18.3928199902
plt.text(1,1.9,'maximum '+str(round(r["x"],2))+'( #'+str(round(f(r["x"], *popt),2)) + ' )')
x_curve = np.linspace(min(x), max(x), 50)
plt.plot(x_curve, f(x_curve, *popt))
plt.plot(r['x'], f(r['x'], *popt), 'ko')
plt.show()
Here is a graphical code example using your equation with weighted fitting, where I have made the max point larger to more easily see the effect of the weighting. In non-weighted curve fitting, all weights are implicitly 1.0 as all data points have equal weight. Scipy's curve_fit routine uses weights in the form of uncertainties, so that giving a point a very small uncertainty (which I have done) is like giving the point a very large weight. This technique can be used to make a fit pass arbitrarily close to any single data point by any software that can perform weghted fitting.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
x = [4.59,9.02,13.05,18.47,20.3]
y = [1.7,1.84,2.0,1.7,1.64]
# note the single very small uncertainty - try making this value 1.0
uncertainties = numpy.array([1.0, 1.0, 1.0E-6, 1.0, 1.0])
# rename data to use previous example
xData = numpy.array(x)
yData = numpy.array(y)
def func(x, p1, p2, p3):
return p3*(p1/((x-p2)**2 + (p1/2)**2))
# these are the same as the scipy defaults
initialParameters = numpy.array([1.0, 1.0, 1.0])
# curve fit the test data, first without uncertainties to
# get us closer to initial starting parameters
ssqParameters, pcov = curve_fit(func, xData, yData, p0 = initialParameters)
# now that we have better starting parameters, use uncertainties
fittedParameters, pcov = curve_fit(func, xData, yData, p0 = ssqParameters, sigma=uncertainties, absolute_sigma=True)
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('Parameters:', fittedParameters)
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = func(xModel, *fittedParameters)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)

Resources