What to do to make a Shiny app give me my output - RStudio

The primary variable is AgeGroup, which has 2 levels. I am trying to get the sample size to output, but for some reason the app either gives an error or won't output anything. Can anyone help? There are some comments in the code to help with confusion.
Code:
library(shiny)
library(shinyWidgets)
library(survival)
library(shinyjs)
library(survminer)
# Define UI for the application
ui <- fluidPage(
# Application title
titlePanel("ProMBA Haslam Ad Sample Size"),
#Put in all key inputs as numeric inputs that the user will type in and choose starting, default values for these inputs
tabPanel("Inputs",
div( id ="form",
column(4,
numericInput("power", label=h6("Power"), value = .9),
numericInput("alpha", label=h6("Alpha"), value = .05),
numericInput("precision", label=h6("Precision"), value =0.05),
numericInput("Delta", label=h6("Delta"), value=.3),
column(4,
numericInput("sample", label=h6("Starting Sample Size"), value = 40),
numericInput("reps", label=h6("Number of Replications"), value=1000)),
),
column(4,
#title of output
h4("Calculated Sample Size"),
verbatimTextOutput("n", placeholder=TRUE)),
#create action buttons for users to run the form and to reset the form
textOutput("Sample Size (n)"),
column(4,
actionButton("action","Calculate"))
)))
server = function(input,output,session){
buttonGo = eventReactive(input$action, {withProgress(message = "Running", {
#relist the key inputs and save them to be able to be used in the rest of the code
n<-input$sample/2
alpha<-input$alpha
power <- input$power
beta<-1-input$power
precision<-input$precision
delta <- input$Delta +1
rep <- input$reps
nincrease<-10
#manually load in the data from the baseline data .xlsx file
Reporting <- c("12/13/21","12/14/21","12/15/21","12/16/21","12/17/21","12/18/21","12/19/21","12/20/21","12/21/21","12/22/21","12/23/21","12/24/21","12/25/21","12/26/21","12/27/21","12/28/21","12/29/21","12/30/21","12/31/21","1/1/22")
AdSet <- "Status Quo"
Results <- c(70,52, 33, 84, 37, 41, 22, 53, 78, 66, 100, 110, 52, 43, 63, 84, 16, 64, 21, 69)
ResultIndicator <- "actions:link_click"
Budget <- 100
CostPerClick<- c(1.43, 1.92, 3.03, 1.19, 2.70, 2.44, 4.55, 1.89, 1.28, 1.52, 1.00, 0.91, 1.92, 2.33, 1.59, 1.19, 6.25, 1.56, 4.76, 1.45)
Impressions <- c(7020, 8430, 5850, 7920, 6890, 7150, 6150, 7370, 8440, 6590, 6750,8720, 6410,7720, 6940, 8010, 7520, 7190, 6540, 6020)
df <- data.frame(Reporting, AdSet, Results, ResultIndicator,Budget,CostPerClick,Impressions)
#define the standard deviation of the results as well as the mean for group 1 of the 2 level variable and the mean for group 2
mean1 = mean(df$Results)
sd1 = sd(df$Results)
mean2 = delta*mean1
click=rep(0,n)
#Create 2 level variable
AgeGroup <- rep(c("Age21-35","Age36-50"),each=n)
#create new data frame with 2 level variable and click repetitions
DataFrame2 <- data.frame(AgeGroup,click)
#create new data frame binding all of the input variables together
DataFrame3 <- data.frame(cbind(n,alpha,power,precision,delta,rep))
#create for loop to find the pvalue of the ttest run with click~AgeGroup
trials=function(){
for(i in 1:nrow(DataFrame2)){
if(any(DataFrame2$AgeGroup[i]=="Age21-35")){DataFrame2$click[i] =rnorm(1,mean1,sd1)}else{DataFrame2$click[i] =rnorm(1,mean2,sd1)}
}
pvalttest=t.test(click~AgeGroup, data=DataFrame2)
return(pvalttest$p.value)
}
#replicate the t-test simulation; note the input id is input$reps (not input$rep)
p_values=replicate(input$reps,trials())
#keep the p values that are significant
significance=p_values[p_values<alpha]
#estimate the power as the proportion of significant p values
power <- length(significance)/length(p_values)
print(c(power,n))
#run a while loop to find the n within the goal power limits
goalpower<-1-beta
lowergoal<-goalpower-input$precision
uppergoal<-goalpower+input$precision
while (power<lowergoal||power>uppergoal){
if (power<lowergoal){
n=n+nincrease
AgeGroup=c()
click=c()
AgeGroup=rep(c("Age21-35","Age36-50"), each=n)
click=rep(NA,2*n)
DataFrame2=data.frame(AgeGroup,click)
p_values=replicate(input$reps, trials())
significance=p_values[p_values<alpha]
power=length(significance)/length(p_values)
print(c(n, power))
}else{
nincrease=nincrease%/%(10/9) #%/% fixes issue of rounding
n=n-nincrease
AgeGroup=c()
click=c()
AgeGroup=rep(c("Age21-35","Age36-50"), each=n)
click=rep(NA,2*n)
DataFrame2=data.frame(AgeGroup,click)
p_values=replicate(input$reps, trials())
significance=p_values[p_values<alpha]
power=length(significance)/length(p_values)
print(c(n, power))
}
}
#n is defined as the sample size of one of the levels of the 2 level variable, so multiply by 2 to get the full sample size
n*2
})
}
shinyApp(ui, server)
I don't need the app to be pretty. I just want it to run whenever someone clicks the calculate button.
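For what it's worth, the most likely culprit is that the eventReactive result is never handed to an output: buttonGo() is defined, but nothing in the server ever calls it, and eventReactive is lazy, so the simulation never runs and the "n" output is never filled. Below is a minimal sketch of the missing wiring, assuming the computed sample size is meant to appear in the verbatimTextOutput("n") slot; the placeholder comment stands in for the simulation code above.
library(shiny)

server <- function(input, output, session) {
  buttonGo <- eventReactive(input$action, {
    withProgress(message = "Running", {
      n <- input$sample / 2
      # ... the rest of the simulation from the question goes here ...
      n * 2
    })
  })
  # the missing piece: render the reactive's value into the "n" output slot;
  # without a render call (or observer) the lazy eventReactive never fires
  output$n <- renderPrint({
    buttonGo()
  })
}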

Related

Fitting Lightgbm distributed with lgb.train hangs

I'm trying to learn how to use lightgbm distributed.
I wrote a simple hello-world kind of code where I use the iris dataset with 150 rows, split it into train (100 rows) and test (50 rows). Then the train and test sets are each further split into two parts, and each part is fed to one of two machines with the appropriate rank.
The problem I see is that lgb.train hangs.
Here is the code:
import argparse
import logging
import lightgbm as lgb
import pandas as pd
from sklearn import datasets
import socket
print('lightgbm', lgb.__version__)
HOST = socket.gethostname()
ip_address = socket.gethostbyname(HOST)
print("IP=", ip_address)
# looks like lightgbm operates only with ip addresses
IPS = ['10.121.22.166', '10.121.22.83']
assert ip_address in IPS
logger = logging.getLogger(__name__)
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 10000)
pd.set_option('max_colwidth', 100)
pd.set_option('precision', 5)
def read_train_data(rank):
    iris = datasets.load_iris()
    iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    partition = rank
    assert partition < 2
    separate = 100
    train_df = iris_df.iloc[:separate]
    test_df = iris_df.iloc[separate:]
    separate_train = 60
    separate_test = 30
    if partition == 0:
        train_df = train_df.iloc[:separate_train]
        test_df = test_df.iloc[:separate_test]
    else:
        train_df = train_df.iloc[separate_train:]
        test_df = test_df.iloc[separate_test:]

    def get_lgb_dataset(df):
        target_column = df.columns[-1]
        columns = df.columns[:-1]
        assert target_column not in columns
        print('Target column', target_column)
        x = df[columns]
        y = df[target_column]
        print(x)
        ds = lgb.Dataset(free_raw_data=False, data=x, label=y, params={
            "enable_bundle": False
        })
        ds.construct()
        return ds

    dtrain = get_lgb_dataset(train_df)
    dtest = get_lgb_dataset(test_df)
    return dtrain, dtest
def train(args):
    port0 = 56456
    rank = IPS.index(ip_address)
    print("Rank=", rank, HOST)
    print("RR", rank)
    dtrain, dtest = read_train_data(rank=rank)
    params = {'boosting_type': 'gbdt',
              'class_weight': None,
              'colsample_bytree': 1.0,
              'importance_type': 'split',
              'learning_rate': 0.1,
              'max_depth': 2,
              'min_child_samples': 20,
              'min_child_weight': 0.001,
              'min_split_gain': 0.0,
              'n_estimators': 1,
              'num_leaves': 31,
              'objective': 'regression',
              'metric': 'rmse',
              'random_state': None,
              'reg_alpha': 0.0,
              'reg_lambda': 0.0,
              'silent': False,
              'subsample': 1.0,
              'subsample_for_bin': 200000,
              'subsample_freq': 0,
              'tree_learner': 'data_parallel',
              'num_threads': 48,
              'machines': ','.join([f'{machine}:{port0}' for i, machine in enumerate(IPS)]),
              'local_listen_port': port0,
              'time_out': 120,
              'num_machines': len(IPS)
              }
    print(params)
    logging.info("starting to train lgb at node with rank %d", rank)
    evals_result = {}
    if args.scikit == 1:
        print("Using scikit learn")
        bst = lgb.sklearn.LGBMRegressor(**params)
        bst.fit(
            dtrain.data,
            dtrain.label,
            eval_set=[(dtest.data, dtest.label)],
        )
    else:
        print("Using regular LGB")
        bst = lgb.train(params,
                        dtrain,
                        valid_sets=[dtest],
                        evals_result=evals_result)
    print(evals_result)
    logging.info("finish xgboost training at node with rank %d", rank)
    return bst
def main(args):
    logging.info("starting the train job")
    model = train(args)
    pd.set_option('display.max_rows', 500)
    print("OUT", model.__class__)
    try:
        print(model.trees_to_dataframe())
    except:
        print(model.booster_.trees_to_dataframe())


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--scikit',
        help='scikit',
        default=0,
        type=int,
    )
    main(parser.parse_args())
I can run it with the scikit fit interface by running python simple_distributed_lgb_test.py --scikit 1 on the two machines, and it produces a reasonable result.
However, when I use --scikit 0 (which uses lgb.train), the fitting just hangs on both nodes. Last messages before it hangs:
[LightGBM] [Info] Total Bins 22
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 2
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Start training from score 0.873750
Is that a bug or expected behavior? dask.py in lightgbm does use the scikit-learn fit interface.
I use an overnight master version 3.2.1.99 (commit 5b7a6f3e7150aeb704d1dd2b852d246af3e913a3 from Jul 12, to be exact).
UPDATE 1
I'm trying to dig into the code. So far I see a few things:
The scikit fit interface appears to have an extra synchronization step before fitting the first tree; lgb.train doesn't have it. I don't know yet where it comes from (I see some Network::Allreduce operations).
It appears that with the scikit interface the workers are synchronized - each worker knows the correct sizes of the blocks to send and receive during the reduce-scatter operations. For example, on the first allreduce worker1 sends 208 blocks and receives 368 blocks of data (in Linkers::SendRecv), while worker2 is reversed - it sends 368 and receives 208 - so the allreduce completes fine.
On the contrary, with lgb.train the workers are not synchronized - each worker has its own numbers of blocks to send and receive during the reduce-scatter at the first DataParallelTreeLearner::FindBestSplits encounter, and they don't match. Worker1 sends 208 and wants to receive 400; worker2 sends 192 and wants to receive 176. So the worker that wants to receive more just hangs, and the other worker eventually hangs too.
Possibly it has something to do with lgb.Dataset - that thing may need to have the same bins or something. I tried to force it with the forcedbins_filename parameter, but it doesn't seem to help with lgb.train.
UPDATE 2
Success. If I remove the following line from the example:
ds.construct()
everything works. So I guess we can't call construct() on the Dataset when using distributed training.

Getting an error trying to predict on a single image with a CNN in PyTorch

Error message
Traceback (most recent call last):
File "pred.py", line 134, in
output = model(data)
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [16, 3, 3, 3], but got 3-dimensional input of size [1, 32, 32] instead.
Prediction code
normalize = transforms.Normalize(mean=[0.4914, 0.4824, 0.4467],
std=[0.2471, 0.2435, 0.2616])
train_set = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
])
model = models.condensenet(args)
model = nn.DataParallel(model)
PATH = "results/savedir/save_models/checkpoint_001.pth.tar"
model.load_state_dict(torch.load(PATH)['state_dict'])
device = torch.device("cpu")
model.eval()
image = Image.open("horse.jpg")
input = train_set(image)
train_loader = torch.utils.data.DataLoader(
    input,
    batch_size=1, shuffle=True, num_workers=1)
for i, data in enumerate(train_loader):
    #input_var = torch.autograd.Variable(data, volatile=True)
    #input_var = input_var.view(1, 3, 32,32)
    output = model(data)
    topk = (1, 5)
    maxk = max(topk)
    _, pred = output.topk(maxk, 1, True, True)
I am getting this error when trying to predict on a single image.
Image shape/size error message
Link to saved model
Training code repository
Please uncomment this line #input_var = input_var.view(1, 3, 32,32) so that your input dimension is 4.
I assume that your number of input channels is 3; if it is 1 (grayscale), use input_var = input_var.view(1, 1, 32,32) instead.
Instead of using the for loop and train_loader, I solved this by just passing the input directly into the model, like this:
input = train_set(image)
input = input.unsqueeze(0)
model.eval()
output = model(input)
More details can be found at this link.

Use of sum function to get a value in sparkR

I have a DataFrame 'data' in SparkR which contains ID = 1, 2, ... and amount = 232, 303, 444, 10, ...
I want to check if the sum of amount is greater than 5000.
sum(data$amount) > 5000
Now SparkR should return TRUE if it's true and FALSE otherwise, but all I get is this message:
Column (SUM(amount)>5000)
How can I check if it's true?
It might not be the best possible solution, but it works. You did create a column of 1 item, but I did not find a way to get the result stored in that item, therefore I applied a different approach:
df <- data.frame(ID=c(1,2,3,4),amount=c(232, 303, 444, 10))
data <- createDataFrame(sqlContext,df)
data <- withColumn(data, "constant", data$ID * 0)
sumFrame <- agg(groupBy(data, data$constant), sumAmount = sum(data$amount))
localResult <- collect(sumFrame)
localResult$sumAmount > 5000
With this approach, I create a DataFrame of 1 row, but a DataFrame is collectable to obtain the result.
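Depending on the SparkR version, the dummy grouping column may not even be needed. Assuming a release where agg()/summarize() accepts a DataFrame directly (newer SparkR versions do), a shorter sketch of the same idea would be:
# global aggregate: a one-row DataFrame holding the sum of amount
sumFrame <- agg(data, sumAmount = sum(data$amount))
# collect() brings that single row back to the driver as a local data.frame,
# so the comparison returns a plain TRUE/FALSE
collect(sumFrame)$sumAmount > 5000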

Realm Cocoa: skipped item when iterating through RLMArray

I have a very strange issue when iterating through an RLMArray. I do something like this:
let tickets = Ticket.objectsWhere("pendingSync = true")
for ticket in tickets {
    print("1. " + ticket.id + ",")
}
realm.beginWriteTransaction()
for ticket in tickets {
    let ticket = ticket as Ticket
    ticket.pendingSync = false
    print("2. " + ticket.id + ",")
}
realm.commitWriteTransaction()
Strangely enough, the output is
1. 125, 1. 127, 1. 123,
2. 125, 2. 123
and NOT
1. 125, 1. 127, 1. 123,
2. 125, 2. 127, 2. 123
In short, the second iteration skips one item. How is this possible? Any ideas?
Thanks!
Mutating items during enumeration is not currently supported in Realm, but will be supported in the upcoming 0.95 release.

How to calculate correlation with a sliding window?

I have a zoo object called aux with yearly data from 1961 to 2009:
      x$nao  x[, 2]
1961  -0.03    63.3
1962   0.20   155.9
1963  -2.98   211.0
I want to calculate the correlation between the two columns using a 20-year sliding window. I am trying to use rollapply, but I don't seem to be able to make it work. I tried several different ways of doing it, but always without success...
> rollapply(aux,20, cor(aux[,1],aux[,2],method="pearson"))
Error in match.fun(FUN) : 'cor(aux[, 1], aux[, 2], method = "pearson")' is not a function, character or symbol
> rollapply(aux,20, cor,method="pearson")
Error in FUN(coredata(data)[posns], ...) : supply both 'x' and 'y' or a matrix-like 'x'
> rollapply(aux,20, cor)
Error in FUN(coredata(data)[posns], ...) : supply both 'x' and 'y' or a matrix-like 'x'
Can anybody tell me how to make rollapply work?
Thanks for helping!
Try this.
library(quantmod)
library(TTR)
#Set the seed so results can be duplicated
set.seed(123)
#Build a zoo object with typical price data
var1 <- zoo(cumprod(1+rnorm(50, 0.01, 0.05)), seq(1961, 2010, 1))
var2 <- zoo(cumprod(1+rnorm(50, 0.015, 0.1)), seq(1961, 2010, 1))
dat <- merge(var1=var1, var2=var2)
plot(dat)
grid()
#Calculate the percent returns for the two prices
del1 <- Delt(dat$var1)
del2 <- Delt(dat$var2)
dat <- merge(dat, del1=del1, del2=del2)
dimnames(dat)[[2]][3] <- "del1"
dimnames(dat)[[2]][4] <- "del2"
head(dat)
plot(dat)
#Calculate the correlation between the two returns using a 5 year sliding window
delcor <- runCor(dat$del1, dat$del2, n=5, use="all.obs", sample=TRUE, cumulative=FALSE)
dat <- merge(dat, delcor)
plot(dat$delcor, type="l", main="Sliding Window Correlation of Two Return Series", xlab="", col="red")
grid()
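If you would rather make rollapply itself work, as originally asked, a sketch along these lines should do it (assuming aux is the two-column zoo object from the question): pass by.column = FALSE so the whole 20 x 2 window matrix reaches the function, and compute the correlation inside it.
library(zoo)
# right-aligned 20-year window; by.column = FALSE hands the full window
# (a 20 x 2 matrix) to FUN instead of one column at a time
rollcor <- rollapply(aux, width = 20,
                     FUN = function(z) cor(z[, 1], z[, 2], method = "pearson"),
                     by.column = FALSE, align = "right")
plot(rollcor, type = "l", xlab = "", main = "20-year rolling correlation")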
