Generating random series based on means and standard deviations from a dataframe - random

I am practicing with dataframe and I have a table like this:
example table
How can I generate 18 x 30 = 540 data points, where each set of 30 points is a random series with n = 30, mean A_i and standard deviation Stdev_A_i (i = 1, ..., 18)?
The method of
# Draw n standard-normal values, standardise them with scale() (default
# centring and scaling, so the sample has mean 0 and sd 1), then stretch by
# `sd` and shift by `mean`. For scalar `mean`/`sd` the returned sample
# therefore has exactly that sample mean and sd.
# scale() returns a one-column matrix, so the result is a matrix as well.
rnorm2 <- function(n,mean,sd) { mean+sd*scale(rnorm(n)) }
r <- rnorm2(30,A,Stdev_A)
print(unname(as.data.frame(r)),quote = FALSE, row.names = FALSE)
only generates 30 data points at a time, so I would have to run it manually 18 times.
Thank you.

data <- read.csv("data.csv", header = T)
# Draw `n` normal variates for every (mean, sd) pair.
#
# Args:
#   n:    number of draws per series.
#   mean: numeric vector of series means.
#   sd:   numeric vector of series standard deviations (same length as
#         `mean`, or a single value shared by all series).
#
# Returns:
#   A numeric matrix with `n` rows and `length(mean)` columns; column j holds
#   the `n` draws generated with mean[j] and sd[j].
normv <- function(n, mean, sd) {
  n_series <- length(mean)
  # rnorm() recycles `mean`/`sd` element by element, while matrix() fills
  # column by column. Repeating each parameter `n` times keeps all draws of
  # one series together in one column instead of interleaving the series.
  out <- rnorm(
    n * n_series,
    mean = rep(mean, each = n),
    sd = rep(sd, each = n)
  )
  matrix(out, nrow = n, ncol = n_series)
}
set.seed(1)
normv( 30 , data$A , data$Stdev_A )

Related

Is there a way that I can solve or stop this error problem in cardinality_threshold?

I have about 17 soil variables that I'd like to run correlations with elevations, temperature and rainfall against species richness and abundance. I have 39 plots (rows) and the columns contain, environmental variables such as elevation, abundance, species richness, temperature, rainfall and then the list of soil variables (17 columns). Below is my script.
Is there a problem with my script or is it the laptop compatibility of the mac I am using? Please help. Thanks
After running the codes, I am getting this error:
Error in stop_if_high_cardinality(data, columns, cardinality_threshold) :
Column 'pH' has more levels (24) than the threshold (15) allowed.
Please remove the column or increase the 'cardinality_threshold' parameter. Increasing the cardinality_threshold may produce long processing times
GGally::ggpairs(
na.omit(nfi_nontree_soilclim_data[, c(11:18)]),
upper = list(
continuous = wrap(
custom_ggally_cor,
method = "spearman", exact = FALSE,
size = 2.5, col = "black", family = "serif", digits = 2
), combo = "box_no_facet", discrete = "count", na = "na"
),
lower = list(
continuous = wrap(
ggally_smooth,
method = "loess", formula = y ~ x,
se = F, lwd = 3, col = "red", shrink = T
), combo = "facethist", discrete = "facetbar", na = "na"
),
diag = list(
continuous = wrap(
ggally_densityDiag,
col = "darkgrey", lwd = .1,
stat = "density", fill = "darkgrey"
), continuous = "densityDiag", na = "naDiag"
), axisLabels = c("show")
) + theme_bw() + theme(
text = element_text(family = "serif", size = 4),
axis.text = element_text(family = "serif", size = 4),
panel.grid = element_blank()
)
This error is a built-in stop because the default parameter is set to only allow 15 levels of a variable to be displayed in one graph. You have 24 levels for one of your variables, so you can either adjust the parameter, i.e., the cardinality_threshold, to that value of 24 or set it to NULL. Null may be more generalizable if the value of 24 isn't always the same. But in general, that number of levels depicted at once is going to be discouraged and have these stop-limits.
library(GGally)
data(iris)
Create data that has factor of more than 15 levels
iris$group = as.factor(sample(sample(letters,16), 150, replace = TRUE))
Just demonstrating that either entry can work
ggpairs(iris, cardinality_threshold = 16)
ggpairs(iris, cardinality_threshold = NULL)

Error: Model is large in H2o autoencoder training

I have a table of size 5360 x 51200: 5360 instances (rows) and 51200 features (columns). I need to reduce the number of features. I tried to do this with a stacked autoencoder in H2O, but training is refused with the following error:
Model is a large and large number of parameters
Here is the code:
library(h2o)
h2o.init(nthreads = -1)
check.deeplearning_stacked_autoencoder <- function() {
# this function builds a vector of autoencoder models, one per layer
#library(h2o)
#h2o.init()
# Train one single-hidden-layer autoencoder per entry of `layers`; each model
# after the first is trained on the deep features extracted from the
# previous model.
#
# Args:
#   training_data: H2O frame used to train the first autoencoder.
#   layers:        numeric vector; layers[i] is the hidden size of model i.
#   args:          named list of extra h2o.deeplearning() arguments. Entries
#                  here override the defaults built below (modifyList()).
#
# Returns:
#   A list with the trained autoencoder models, in training order. The
#   number of models is also printed with cat(), as before.
get_stacked_ae_array <- function(training_data, layers, args) {
  models <- vector("list", length(layers))
  for (i in seq_along(layers)) {
    ae_model <- do.call(
      h2o.deeplearning,
      modifyList(
        list(
          x = names(training_data),
          training_frame = training_data,
          autoencoder = TRUE,
          hidden = layers[i]
        ),
        args
      )
    )
    # NOTE(review): every model here has a single hidden layer
    # (hidden = layers[i]), so layer = 3 may be out of range for
    # h2o.deepfeatures(); layer = 1 would select the only hidden layer.
    # Confirm against the h2o documentation before relying on this.
    training_data <- h2o.deepfeatures(ae_model, training_data, layer = 3)
    # Rename the generic "DF" feature prefix so columns record their stage.
    names(training_data) <-
      gsub("DF", paste0("L", i), names(training_data))
    models[[i]] <- ae_model
  }
  cat(length(models))
  # Return the models themselves: cat() returns NULL, and the caller needs
  # the list to encode new data with apply_stacked_ae_array().
  models
}
# this function returns final encoded contents
# Encode `data` by passing it through a sequence of autoencoder models.
#
# Args:
#   data: H2O frame to encode.
#   ae:   list of autoencoder models, applied in order. An empty list (or
#         NULL) leaves `data` unchanged.
#
# Returns:
#   The output of the last model's h2o.deepfeatures() call, with the "DF"
#   column prefix replaced by "L<stage>".
apply_stacked_ae_array <- function(data, ae) {
  # seq_along() instead of 1:length(ae): for an empty `ae` the latter
  # iterates over c(1, 0) and fails on ae[[1]].
  for (i in seq_along(ae)) {
    # NOTE(review): layer = 3 is kept from the original; confirm it is a
    # valid hidden-layer index for the models stored in `ae`.
    data <- h2o.deepfeatures(ae[[i]], data, layer = 3)
    names(data) <- gsub("DF", paste0("L", i), names(data))
  }
  data
}
TRAIN <-
"E:/Chiranjibi file/Geometric features/Lu/Train/d_features.csv"
TEST <-
"E:/Chiranjibi file/Geometric features/Lu/Test/d_features.csv"
response <- 51201
# set to T for RUnit
# set to F for stand-alone demo
if (T) {
train_hex <- h2o.importFile((TRAIN))
test_hex <- h2o.importFile((TEST))
} else
{
library(h2o)
h2o.init()
homedir <-
paste0(path.expand("~"), "/h2o-dev/") #modify if needed
train_hex <-
h2o.importFile(path = paste0(homedir, TRAIN),
header = F,
sep = ',')
test_hex <-
h2o.importFile(path = paste0(homedir, TEST),
header = F,
sep = ',')
}
train <- train_hex[, -response]
test <- test_hex [, -response]
train_hex[, response] <- as.factor(train_hex[, response])
test_hex [, response] <- as.factor(test_hex [, response])
## Build reference model on full dataset and evaluate it on the test set
model_ref <-
h2o.deeplearning(
training_frame = train_hex,
x = 1:(ncol(train_hex) - 1),
y = response,
hidden = c(67),
epochs = 50
)
p_ref <- h2o.performance(model_ref, test_hex)
h2o.logloss(p_ref)
## Now build a stacked autoencoder: one AE model per entry of `layers`
## (seven entries here), with hidden sizes shrinking from 50000 to 500.
## NOTE(review): the first entry (50000) is close to the number of input
## features; this is presumably what triggers the "model is large" error.
layers <- c(50000,20000,10000,5000,2000, 1000, 500)
## Extra h2o.deeplearning() arguments handed to get_stacked_ae_array()
args <- list(activation = "Tanh",
epochs = 1,
l1 = 1e-5)
ae <- get_stacked_ae_array(train, layers, args)
## Now compress the training/testing data with the AE models stored in `ae`
train_compressed <- apply_stacked_ae_array(train, ae)
test_compressed <- apply_stacked_ae_array(test, ae)
## Build a simple model using these new features (compressed training data) and evaluate it on the compressed test set.
train_w_resp <- h2o.cbind(train_compressed, train_hex[, response])
test_w_resp <- h2o.cbind(test_compressed, test_hex[, response])
model_on_compressed_data <-
h2o.deeplearning(
training_frame = train_w_resp,
x = 1:(ncol(train_w_resp) - 1),
y = ncol(train_w_resp),
hidden = c(67),
epochs = 1
)
p <- h2o.performance(model_on_compressed_data, test_w_resp)
h2o.logloss(p)
}
#h2o.describe(train)
#doTest("Deep Learning Stacked Autoencoder", check.deeplearning_stacked_autoencoder)
As Tom says, your autoencoder first layer is too big.
51,200 is a lot of features. How much correlation is there between them? The more correlation you have, the smaller the first layer of your autoencoder can happily be.
Try h2o.prcomp() and see how many dimensions cover 99% of the variance; that is often a good guide to how big your first layer can/should be.
Or, if you prefer a more experimental approach:
Start with, e.g. 200 neurons in one layer.
Look at the MSE it gets to, after enough epochs to stop improving.
Double the number of neurons in that layer.
See if the MSE gets any better. If not, stop there.
If it did, double again, and repeat.
You could then try moving to multiple layers. But not much point using a bigger first layer than the best you can get from trying a single layer.
Since your dataset has 51,200 features, and your layers array has 50,000 as the first value, 51200 * 50000 == 2.56e9 weights in that first set of network connections.
It’s too many, try smaller numbers.

Is there a way to make this code faster and if possible avoid loops?

A1, B1, C1, A2, B2 and C2 are 6 matrix with the same dimensions 4435X2000.
I have to find the values i, j and k for which A1(k,2000) == A2(i,j) and B1(k,2000) == B2(i,j) and C1(k,2000) == C2(i,j) , with the condition X(k)==1 and Y(i,j)==1
The objective is to find: counter, L, T and D
Is there a way to make this code faster? Can I avoid loops?
counter=0;
L(1)=0;
T(1)=0;
D(1)=0;
for k=1:4435
if X(k)==1 % X is a vector (4435x1)
F(k,:) = [A1(k,2000) B1(k,2000) C1(k,2000)]
for i=1:4435
for j=100:1999
if Y(i,j)==1 % Y is a matrix (4435x1999)
if F(k,:) == [A2(i,j) B2(i,j) C2(i,j)]
counter = counter+1;
L(counter)=k;
T(counter)=i;
D(counter)=j;
end
end
end
end
end
end
I want a solution that will save me at least 80% of the computation time!
and not have the error message: Out of memory
See how this works out for you -
%// Find every (k,i,j) with X(k)==1, Y(i,j)==1, j in range1 and
%// [A1 B1 C1](k,end) equal to [A2 B2 C2](i,j).
%// Reads X, Y, A1, B1, C1, A2, B2, C2 from the workspace and leaves them
%// untouched, so the script can be re-run.
%// Outputs (column vectors, one entry per match): L (k), T (i), D (j).

range1 = 100:1999;  %// define range for columns

%// Crop Y, A2, B2, C2 to that column range exactly once. Cropping Y first
%// and then indexing the cropped copy with range1 again would run past its
%// last column.
Y_data = Y(:,range1)==1;
A2r = A2(:,range1);
B2r = B2(:,range1);
C2r = C2(:,range1);

%// Rows k for which X is set
idx_X = find(X(:)==1);

%// Map X==1 onto the last column of A1, B1, C1
A1Lr = A1(idx_X,end);
B1Lr = B1(idx_X,end);
C1Lr = C1(idx_X,end);

%// Collect the matches of every selected row in its own cell. Only real
%// matches are stored, instead of preallocating one slot for every
%// (k, Y==1) combination, which can exhaust memory.
nSel = numel(idx_X);
Tc = cell(nSel,1);
Dc = cell(nSel,1);
Lc = cell(nSel,1);
for iter1 = 1:nSel
    [R,C] = find(Y_data & A2r==A1Lr(iter1) & B2r==B1Lr(iter1) & C2r==C1Lr(iter1));
    Tc{iter1} = R(:);
    Dc{iter1} = C(:) + range1(1)-1;   %// back to column numbers of the full matrices
    Lc{iter1} = repmat(idx_X(iter1),numel(R),1);
end

%// Packup the outputs
T = vertcat(Tc{:});
D = vertcat(Dc{:});
L = vertcat(Lc{:});
It is very difficult to determine what your code is actually supposed to accomplish, without really working to interpret your code. However, I'll give it a crack:
% Find every (k,i,j) with X(k)==1, Y(i,j)==1, j in 100:1999 and
% [A1 B1 C1](k,2000) equal to [A2 B2 C2](i,j).
% Reads X, Y, A1, B1, C1, A2, B2, C2 from the workspace.
% Produces counter (number of matches) and the row vectors L (k), T (i), D (j).

% Determine where X is true.
XTrue = X(:) == 1;
% Extract the reference values from column 2000 of A1,B1,C1, one row per k.
% Rows where X is false are excluded with XTrue below; leaving them as
% zero-filled rows could produce false matches against zeros in A2,B2,C2.
F = [ A1(:,2000) B1(:,2000) C1(:,2000) ];
% Determine where Y is true, restricted to the searched columns.
colRange = 100 : 1999;
[ iAll , jAll ] = find ( Y(:,colRange) == 1 );
jAll = jAll + colRange(1) - 1;   % back to column numbers of the full matrices
% Determine where the extracted values match. Matches are collected per
% (i,j) position and concatenated once at the end; concatenating column
% vectors of different lengths side by side is a dimension error.
nCand = numel ( iAll );
Lc = cell ( nCand , 1 );
Tc = cell ( nCand , 1 );
Dc = cell ( nCand , 1 );
counter = 0;   % has to start at 0: [] + n stays []
for ii = 1 : nCand
    i = iAll(ii);
    j = jAll(ii);
    kMatch = find ( XTrue & F(:,1)==A2(i,j) & F(:,2)==B2(i,j) & F(:,3)==C2(i,j) );
    matchCount = numel ( kMatch );
    if ( matchCount > 0 )
        counter = counter + matchCount;
        Lc{ii} = kMatch;
        Tc{ii} = repmat ( i , matchCount , 1 );
        Dc{ii} = repmat ( j , matchCount , 1 );
    end
end
L = vertcat ( Lc{:} ).';
T = vertcat ( Tc{:} ).';
D = vertcat ( Dc{:} ).';

defining a color injection to value on matrix , in image( ) R

With image(), how can I define a fixed mapping from the values in a matrix to colours?
I'm writing a programme to make colour chart on (x,y) coordinate-plane
Shows the hazard ratio from a value of x-axis and y-axis.
(eg.) x-axis is for "age", and y-axis is for "blood pressure", and if the third or upper dimension (z) is there, one colour chart is made for each z unique value.
I'm trying image(), but it isn't what I want: the colours are assigned separately for each (x,y) chart, so only the (x,y) effect is represented.
(The effect of the z value acts like an intercept and does not show up in the colours.)
I think that to define injection of the colour to the value in my function would solve
this problem.
In my function , I define a matrix
mat.for.map[i,j] <- exp ( yax[i] * yest
+ xax[j] * xest
+ zax * zest
+ cat1
+ cat2 )
'xax' is sequence for x-axis
xax <- seq ( xlow , xup , length.out = xlen )
'yax' is for y-axis
yax <- seq(ylow,yup,length.out = ylen)
'xest' is hazard for variable-x
'yest' is for variable-y
and I have a dataset of hazard ratios
# Hazard estimates in column-major order: five risk factors for each
# combination of the two `colname1` and the two `colname2` levels.
hazards <- c(
  0.1302, 0.0154, -0.0030, 0.5971, 0.3773,
  0.1300, 0.0159, -0.0017, 1.1522, 0.3390,
  0.1037, 0.0133, 0.0121, 1.2249, 0.3647,
  0.1480, 0.0045, 0.0034, 1.5109, 0.7472
)
rowname1 <- c("age", "sbp", "TotCho", "DM", "smk")
colname2 <- c("Stoke", "CHD")
colname1 <- c("men", "women")
array.hazards <- array(hazards, c(5, 2, 2))
# Label the array, not the raw vector: `hazards` has no dim attribute, so
# assigning dimnames to it is an error.
dimnames(array.hazards) <- list(rowname1, colname1, colname2)
and if a third or higher dimension is given
through the arguments 'z' (value), 'zest' (hazard), 'cat1' and 'cat2',
their contributions are added to the values in 'mat.for.map'.
THIS IS MY PROBLEM
when I change 'z' or 'cat1' or 'cat2' ,the values in 'mat.for.map' changes ,
but the colours on chart does not.
My function is below
# Draw a colour chart of exp(x * xest + y * yest + z * zest + cat1 + cat2)
# over a regular x/y grid with image().
#
# Args:
#   xest, yest, zest: coefficients multiplied with the x, y and z values.
#   xup, xlow, xlen:  upper bound, lower bound and number of grid points on x.
#   yup, ylow, ylen:  the same for y.
#   zup, zlow, zlen:  accepted but not used by the body.
#   xlab, ylab:       axis labels passed on to image().
#   cat1, cat2:       constants added to the linear predictor.
#   zlim:             optional c(min, max) colour range. image() maps colours
#                     to the range of the plotted matrix by default, so a
#                     constant shift (z, cat1, cat2) changes the values but
#                     not the picture; pass the same `zlim` to several calls
#                     to make their colours comparable.
#
# The z value is not an argument: it is read from a variable `z` that must
# be visible from where this function is defined.
# NOTE(review): presumably `z` was meant to come from zup/zlow/zlen - confirm.
#
# Returns:
#   The value of the image() call (the chart is drawn as a side effect).
chart.make <- function(xest, yest,
                       zest, xup, xlow, xlen,
                       yup, ylow, ylen,
                       zup = NULL, zlow = NULL, zlen = NULL,
                       xlab = NULL, ylab = NULL,
                       cat1 = 0, cat2 = 0,
                       zlim = NULL) {
  xax <- seq(xlow, xup, length.out = xlen)
  yax <- seq(ylow, yup, length.out = ylen)
  zax <- z
  stopifnot(length(zax) == 1)
  # Part of the linear predictor that is constant over the grid.
  shift <- zax * zest + cat1 + cat2
  # image() expects z[i, j] to belong to (x[i], y[j]), i.e. one row per x
  # value. Filling a length(x)-row matrix as [y index, x index] is out of
  # bounds when xlen != ylen and transposed otherwise.
  mat.for.map <- outer(
    xax, yax,
    function(x, y) exp(x * xest + y * yest + shift)
  )
  if (is.null(zlim)) {
    # Same default that image() would use on its own.
    zlim <- range(mat.for.map[is.finite(mat.for.map)])
  }
  image(
    x = xax, y = yax, z = mat.for.map,
    zlim = zlim, col = rainbow(100),
    xlab = xlab, ylab = ylab, useRaster = TRUE
  )
}
For example
hazards <- c( 0.1302 , 0.0154 , -0.0030 , 0.5971 , 0.3773
, 0.1300 , 0.0159 , -0.0017 , 1.1522 , 0.3390
, 0.1037 , 0.0133, 0.0121 , 1.2249 , 0.3647
, 0.1480 , 0.0045 , 0.0034 , 1.5109 , 0.7472)
rowname1 <- c( "age" , "sbp" , "TotCho" , "DM" , "smk" )
colname2 <- c( "Stoke" , "CHD")
colname1 <- c( "men" , "women" )
array.hazards <- array(hazards, c(5,2,2))
dimnames(array.hazards)<- list(rowname1,colname1,colname2)
pdf("chart1.pdf")
z1 <- chart.make(
array.hazards["sbp","women","CHD"],array.hazards["TotCho","women","CHD"]
,zest=array.hazards["age","women","CHD"]
,240,90,30
,279,160,6,zup=75,
,ylab="Total Cholesterol",xlab="Sistlic Blood Pressure"
,cat1=array.hazards["smk","women","CHD"])
mtext(text="CHD in women 70<age<80,smoker,DM",side=3,cex=1)
dev.off()
pdf("chart2.pdf")
z1 <- chart.make(
array.hazards["sbp","women","CHD"],array.hazards["TotCho","women","CHD"]
,zest=array.hazards["age","women","CHD"]
,240,90,30
,279,160,6,zup=75,
,ylab="Total Cholesterol",xlab="Sistlic Blood Pressure"
,cat1=array.hazards["smk","women","CHD"]
,cat2=array.hazards["DM","women","CHD"])
mtext(text="CHD in women 70<age<80,smoker",side=3,cex=1)
dev.off()
As I commented to Paul Hiemstra, I'm sorry that I can't show you an example of what I want, so I present a simplified one.
x <- seq(1:200,by=2)
z <- 20
mat1 <- matrix(x,ncol=10)
mat2 <- matrix(x:z,ncol=10)
mat3 <- matrix(x+z,ncol=10)
image.1 <- image(mat1,col=rainbow(100))
image.2 <- image(mat2,col=rainbow(100))
image.3 <- image(mat3,col=rainbow(100))
I think my problem is equivalent to: 'image.1 looks the same as image.3; the effect of adding z is not reflected in the colours on the chart.'
Thank you very much for your time. (I'm afraid of being closed)
A quick attempt:
library(ggplot2)
ggplot(melt(array4hm), aes(x=X1,y=X2,fill=value))+geom_tile()+facet_grid(.~X3)

Applying nlminb to subsets of data (by index or label) and store what the program returns as a new data frame

I was wondering if anyone could kindly help me with this seemingly easy task. I'm using nlminb to conduct optimization and compute some statistics by index. Here's an example from nlminb help.
> x <- rnbinom(100, mu = 10, size = 10)
> hdev <- function(par) {
+ -sum(dnbinom(x, mu = par[1], size = par[2], log = TRUE))
+ }
> nlminb(c(9, 12), hdev)
$par
[1] 9.730000 5.954936
$objective
[1] 297.2074
$convergence
[1] 0
$message
[1] "relative convergence (4)"
$iterations
[1] 10
$evaluations
function gradient
12 27
Suppose I generate random variables x, y, and z where z acts as an index (from 1 to 3).
> x <- rnbinom(100, mu = 10, size = 10)
> y <- rnbinom(100, mu = 10, size = 10)
> z <- rep(1:3, length=100)
> A <- cbind(x,y,z)
> hdev <- function(par) {
+ -sum(dnbinom(x+y, mu = par[1], size = par[2], log = TRUE))}
How can I apply nlminb(c(9, 12), hdev) to the data set by index z? In other words, I would like to compute nlminb(c(9, 12), hdev) for z=1, z=2, and z=3 separately. I tried by(A, z, function(A) nlminb(c(9,12), hdev)) and sparseby(A, z, function(A) nlminb(c(9,12), hdev)), but they return exactly the same values for each value of z.
I would like to turn each output into a new data frame so that it will become a 3X2 matrix.
[1] Z1_ANSWER_1 Z1_ANSWER_2
[2] Z2_ANSWER_1 Z2_ANSWER_2
[3] Z3_ANSWER_1 Z3_ANSWER_2
Since nlminb returns the summary of statistics, I needed to use CASEZ1<-nlminb$par, CASEZ2<-nlminb$par, CASEZ3<-nlminb$par and then use cbind to combine them. However, I would like to automate this process as the real data I'm working on has a lot more categories than z presented here.
If I'm not making myself clear, please let me know. I'll see if I can replicate the actual data set and functions I'm working on (I just don't have them on this computer).
Thank you very much in advance.
Let me try an approach
x <- rnbinom(100, mu = 10, size = 10)
y <- rnbinom(100, mu = 10, size = 10)
z <- rep(1:3, length=100)
A <- as.data.frame(cbind(x,y,z))
At first load the plyr library
library(plyr)
The following code returns the results for each z
# Fit the negative binomial separately for every value of z and return the
# full nlminb() result of each group as a list.
dlply(A, .(z), function(grp) {
  # Negative log-likelihood; par[1] is passed as `mu`, par[2] as `size`.
  hdev <- function(par, mydata) {
    -sum(dnbinom(mydata, mu = par[1], size = par[2], log = TRUE))
  }
  # Add the two columns as plain vectors. This avoids relying on
  # as.vector(<data.frame>) returning the data frame unchanged, which no
  # longer holds since R 4.2.0.
  nlminb(c(9, 12), hdev, mydata = grp$x + grp$y)
})
Now, with this one you will get a 3x2 dataframe with the $par results
# Fit the negative binomial separately for every value of z and return one
# row per group holding the two optimised parameters ($par).
ddply(A, .(z), function(grp) {
  # Negative log-likelihood; par[1] is passed as `mu`, par[2] as `size`.
  hdev <- function(par, mydata) {
    -sum(dnbinom(mydata, mu = par[1], size = par[2], log = TRUE))
  }
  # Add the two columns as plain vectors. This avoids relying on
  # as.vector(<data.frame>) returning the data frame unchanged, which no
  # longer holds since R 4.2.0.
  res <- nlminb(c(9, 12), hdev, mydata = grp$x + grp$y)
  res$par
})

Resources