Use of sum function to get a value in SparkR

I have a DataFrame 'data' in SparkR which contains ID = 1, 2, ... and amount = 232, 303, 444, 10, ...
I want to check whether the sum of amount is greater than 5000:
sum(data$amount) > 5000
SparkR should return TRUE if it is and FALSE otherwise, but all I get is this message:
Column (SUM(amount)>5000)
How can I check if it's true?

It might not be the best possible solution, but it works. Your expression does create a Column with one item; however, I did not find a way to extract the result stored in that item, so I applied a different approach:
df <- data.frame(ID=c(1,2,3,4),amount=c(232, 303, 444, 10))
data <- createDataFrame(sqlContext,df)
data <- withColumn(data, "constant", data$ID * 0)
sumFrame <- agg(groupBy(data, data$constant), sumAmount = sum(data$amount))
localResult <- collect(sumFrame)
localResult$sumAmount > 5000
With this approach, I create a DataFrame of 1 row, but a DataFrame is collectable to obtain the result.
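A possibly shorter variant (untested; it assumes this SparkR version lets you select the aggregate Column directly and collect the resulting one-row DataFrame):
# sketch: aggregate to a 1x1 DataFrame, collect it locally, then compare
totals <- collect(select(data, sum(data$amount)))
totals[1, 1] > 5000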

Related

what to do to make shinyapp give me my output

The primary variable is AgeGroup, which has 2 levels. I am trying to get the sample size to output, but for some reason the app either gives an error or won't output anything. Can anyone help? There are some comments in the code to help with any confusion.
Code:
library(shiny)
library(shinyWidgets)
library(survival)
library(shinyjs)
library(survminer)
# Define UI for the application
ui <- fluidPage(
# Application title
titlePanel("ProMBA Haslam Ad Sample Size"),
#Put in all key inputs as numeric inputs that the user will type in, and choose starting/default values for these inputs
tabPanel("Inputs",
div( id ="form",
column(4,
numericInput("power", label=h6("Power"), value = .9),
numericInput("alpha", label=h6("Alpha"), value = .05),
numericInput("precision", label=h6("Precision"), value =0.05),
numericInput("Delta", label=h6("Delta"), value=.3),
column(4,
numericInput("sample", label=h6("Starting Sample Size"), value = 40),
numericInput("reps", label=h6("Number of Replications"), value=1000)),
),
column(4,
#title of output
h4("Calculated Sample Size"),
verbatimTextOutput(("n"),placeholder=TRUE)),
#create action buttons for users to run the form and to reset the form
textOutput("Sample Size (n)"),
column(4,
actionButton("action","Calculate"))
)))
server = function(input,output,session){
buttonGo = eventReactive(input$action, {withProgress(message = "Running", {
#relist the key inputs and save them to be able to be used in the rest of the code
n<-input$sample/2
alpha<-input$alpha
power <- input$power
beta<-1-input$power
precision<-input$precision
delta <- input$Delta +1
rep <- input$reps
nincrease<-10
#manually load in the data from the baseline data .xlsx file
Reporting <- c("12/13/21","12/14/21","12/15/21","12/16/21","12/17/21","12/18/21","12/19/21","12/20/21","12/21/21","12/22/21","12/23/21","12/24/21","12/25/21","12/26/21","12/27/21","12/28/21","12/29/21","12/30/21","12/31/21","1/1/22")
AdSet <- "Status Quo"
Results <- c(70,52, 33, 84, 37, 41, 22, 53, 78, 66, 100, 110, 52, 43, 63, 84, 16, 64, 21, 69)
ResultIndicator <- "actions:link_click"
Budget <- 100
CostPerClick<- c(1.43, 1.92, 3.03, 1.19, 2.70, 2.44, 4.55, 1.89, 1.28, 1.52, 1.00, 0.91, 1.92, 2.33, 1.59, 1.19, 6.25, 1.56, 4.76, 1.45)
Impressions <- c(7020, 8430, 5850, 7920, 6890, 7150, 6150, 7370, 8440, 6590, 6750,8720, 6410,7720, 6940, 8010, 7520, 7190, 6540, 6020)
df <- data.frame(Reporting, AdSet, Results, ResultIndicator,Budget,CostPerClick,Impressions)
#define the standard deviation of the results as well as the mean for group 1 of the 2 level variable and the mean for group 2
mean1 = mean(df$Results)
sd1 = sd(df$Results)
mean2 = delta*mean1
click=rep(0,n)
#Create 2 level variable
AgeGroup <- rep(c("Age21-35","Age36-50"),each=n)
#create new data frame with 2 level variable and click repetitions
DataFrame2 <- data.frame(AgeGroup,click)
#create new data frame binding all of the input variables together
DataFrame3 <- data.frame(cbind(n,alpha,power,precision,delta,rep))
#create for loop to find the pvalue of the ttest run with click~AgeGroup
trials=function(){
for(i in 1:nrow(DataFrame2)){
if(any(DataFrame2$AgeGroup[i]=="Age21-35")){DataFrame2$click[i] =rnorm(1,mean1,sd1)}else{DataFrame2$click[i] =rnorm(1,mean2,sd1)}
}
pvalttest=t.test(click~AgeGroup, data=DataFrame2)
return(pvalttest$p.value)
}
p_values=replicate(input$reps,trials())
#keep only the p values that are significant
significance=p_values[p_values<alpha]
#compute the power as the proportion of significant p values
power <- length(significance)/length(p_values)
print(c(power,n))
#run a while loop to find the n within the goal power limits
goalpower<-1-beta
lowergoal<-goalpower-input$precision
uppergoal<-goalpower+input$precision
while (power<lowergoal||power>uppergoal){
if (power<lowergoal){
n=n+nincrease
AgeGroup=c()
click=c()
AgeGroup=rep(c("Age21-35","Age36-50"), each=n)
click=rep(NA,2*n)
DataFrame2=data.frame(AgeGroup,click)
p_values=replicate(input$reps, trials())
significance=p_values[p_values<alpha]
power=length(significance)/length(p_values)
print(c(n, power))
}else{
nincrease=nincrease%/%(10/9) #%/% fixes issue of rounding
n=n-nincrease
AgeGroup=c()
click=c()
AgeGroup=rep(c("Age21-35","Age36-50"), each=n)
click=rep(NA,2*n)
DataFrame2=data.frame(AgeGroup,click)
p_values=replicate(input$reps, trials())
significance=p_values[p_values<alpha]
power=length(significance)/length(p_values)
print(c(n, power))
}
}
#n is defined as the sample size of one level of the 2-level variable, so multiply by 2 to get the full sample size
n*2
})})
}
shinyApp(ui, server)
I don't need the app to be pretty. I just want it to run whenever someone clicks the Calculate button.
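For what it's worth, one gap that stands out in the posted code (an observation, not a confirmed fix): the UI declares verbatimTextOutput("n"), but the server never assigns anything to output$n, so the value returned by buttonGo() is never rendered. A minimal sketch of the missing wiring, placed inside server() after buttonGo is defined, might look like this:
# hypothetical sketch: render the eventReactive's return value (n * 2) in the "n" output
output$n <- renderPrint({
  buttonGo()
})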

How to speed up the addition of a new column in pandas, based on comparisons on an existing one

I am working on a large-ish dataframe collection with some machine data in several tables. The goal is to add a column to every table which expresses the row's "class", considering its vicinity to a certain time stamp.
seconds = 1800
for i in range(len(tables)):  # looping over 20 equally structured tables containing machine data
    table = tables[i]
    table['Class'] = 'no event'
    for event in events[i].values:  # looping over 20 equally structured tables containing events
        event_time = event[1]  # get integer time stamp
        start_time = event_time - seconds
        table.loc[(table.Time <= event_time) & (table.Time >= start_time), 'Class'] = 'event soon'
The event times and the entries in table.Time are integers. The point is to assign the class "event soon" to all rows that fall within a specific time frame (the number of seconds) before an event.
The code takes quite a long time to run, and I am not sure what is to blame or what can be fixed. The number of seconds does not have much impact on the runtime, so the part where the table is actually changed is probably working fine; it is more likely the nested loops. However, I don't see how to get rid of them. Hopefully there is a faster, more pandas-like way to go about adding this class column.
I am working with Python 3.6 and pandas 0.19.2.
You can use numpy broadcasting to do this in a vectorised way instead of looping.
Dummy data generation
import numpy as np
import pandas as pd

num_tables = 5
seconds = 1800

def gen_table(count):
    for i in range(count):
        times = [(100 + j)**2 for j in range(i, 50 + i)]
        df = pd.DataFrame(data={'Time': times})
        yield df

def gen_events(count, num_tables):
    for i in range(num_tables):
        times = [1E4 + 100 * (i + j)**2 for j in range(count)]
        yield pd.DataFrame(data={'events': times})

tables = list(gen_table(num_tables))      # a list of 5 DataFrames of length 50
events = list(gen_events(5, num_tables))  # a list of 5 DataFrames of length 5
Comparison
For debugging, I added a dict of verification DataFrames; they are not needed for the actual result.
verification = {}

for i, (table, event_df) in enumerate(zip(tables, events)):
    event_list = event_df['events']
    time_diff = event_list.values - table['Time'].values[:, np.newaxis]  # This is where the magic happens
    events_close = np.any((0 < time_diff) & (time_diff < seconds), axis=1)
    table['Class'] = np.where(events_close, 'event soon', 'no event')
    # The stuff after this line can be deleted since it's only used for the verification
    df = pd.DataFrame(data=time_diff, index=table['Time'], columns=event_list)
    df['event'] = np.any((0 < time_diff) & (time_diff < seconds), axis=1)
    verification[i] = df
newaxis
A good explanation of broadcasting is in Jake VanderPlas's book (the Python Data Science Handbook).
table['Time'].values[:,np.newaxis]
gives a (50,1) 2-d array
array([[10000],
[10201],
[10404],
....
[21609],
[21904],
[22201]], dtype=int64)
Verification
For the first step the verification df looks like this:
events 10000.0 10100.0 10400.0 10900.0 11600.0 event
Time
10000 0.0 100.0 400.0 900.0 1600.0 True
10201 -201.0 -101.0 199.0 699.0 1399.0 True
10404 -404.0 -304.0 -4.0 496.0 1196.0 True
10609 -609.0 -509.0 -209.0 291.0 991.0 True
10816 -816.0 -716.0 -416.0 84.0 784.0 True
11025 -1025.0 -925.0 -625.0 -125.0 575.0 True
11236 -1236.0 -1136.0 -836.0 -336.0 364.0 True
11449 -1449.0 -1349.0 -1049.0 -549.0 151.0 True
11664 -1664.0 -1564.0 -1264.0 -764.0 -64.0 False
11881 -1881.0 -1781.0 -1481.0 -981.0 -281.0 False
12100 -2100.0 -2000.0 -1700.0 -1200.0 -500.0 False
12321 -2321.0 -2221.0 -1921.0 -1421.0 -721.0 False
12544 -2544.0 -2444.0 -2144.0 -1644.0 -944.0 False
....
20449 -10449.0 -10349.0 -10049.0 -9549.0 -8849.0 False
20736 -10736.0 -10636.0 -10336.0 -9836.0 -9136.0 False
21025 -11025.0 -10925.0 -10625.0 -10125.0 -9425.0 False
21316 -11316.0 -11216.0 -10916.0 -10416.0 -9716.0 False
21609 -11609.0 -11509.0 -11209.0 -10709.0 -10009.0 False
21904 -11904.0 -11804.0 -11504.0 -11004.0 -10304.0 False
22201 -12201.0 -12101.0 -11801.0 -11301.0 -10601.0 False
Small optimizations of the original code
You can shave a few lines and some assignments off the original algorithm:
for table, event_df in zip(tables, events):
    table['Class'] = 'no event'
    for event_time in event_df['events']:  # looping over the events belonging to this table
        start_time = event_time - seconds
        table.loc[table['Time'].between(start_time, event_time), 'Class'] = 'event soon'
You might shave off some more time if, instead of the strings 'no event' and 'event soon', you just used booleans.

volemont/insights:chart.EquityCurve.R: a bug in graphing peaks of cumulative return?

I came across a function for graphing the cumulative return of a strategy and the peaks of that return in a great example of combining shiny and quantstrat, thanks to Simon Otziger. The source code is here. The code works fine most of the time, but for some data it won't graph the peaks properly.
The code below is simplified, but the key logic is unchanged. I ran it with three sets of data (cumPNL1, cumPNL2, cumPNL3) copied from three example strategies; the first set causes the code to fail to graph the peaks properly.
I ran the following code with cumPNL1, cumPNL2 and cumPNL3 separately. With both cumPNL2 and cumPNL3 the code produces the cumulative return line and the peak points successfully. With cumPNL1, however, it only produces the line; the peaks are not at the right positions.
I noticed that the peakIndex vectors based on cumPNL2 and cumPNL3 both have TRUE as their first value, so when I change the code by adding the line peakIndex[1] <- TRUE, cumPNL1 works fine as well.
Though it now works with the modified code, I have no idea why it behaves like this. Could anyone have a look? Thanks.
cumPNL1 <- c(-193,-345,-406,-472,-562,-543,-450,-460,-544,-659,-581,-342,-384,276,-858,-257.99)
cumPNL2 <- c(35.64,4.95,-2.97,-6.93,11.88,-19.8,-26.73,-39.6,-49.5,-50.49,-51.48,-48.51,-50.49,-55.44,143.55,770.22,745.47,691.02,847.44,1141.47,1007.82,1392.93,1855.26,1863.18,2536.38,2778.93,2811.6,2859.12,2417.58)
cumPNL3 <- c(35.64,4.95,-2.97,-6.93,11.88,-19.8,-26.73,-39.6,-49.5,-50.49,-51.48,-48.51,-50.49,-55.44,143.55,770.22,745.47,691.02,847.44,1141.47,1007.82,1392.93,1855.26,1863.18,2536.38,2778.93,2811.6,2859.12,2417.58)
peakIndex <- c(cumPNL3[1] > 0, diff(cummax(cumPNL3)) > 0)
# peakIndex[1] <- TRUE
dev.new()
plot(cumPNL3, type='n', xlab="index of trades", ylab="returns in cash", main="cumulative returns and peaks")
grid()
lines(cumPNL3)
points(cbind(1 : length(cumPNL3), cumPNL3)[peakIndex, ],
pch=19, col='green', cex=0.6)
legend(
x='bottomright', inset=0.1,
legend=c('Net Profit','Peaks'),
lty=c(1, NA), pch=c(NA, 19),
col=c('black','green')
)
cumPNL1 has a single peak, so R drops the dimension and reduces the subsetted numerical matrix to a numerical vector of length 2. The points function then plots those two values on the y-axis against the x-axis indices 1 and 2:
peakIndex1 <- c(cumPNL1[1] > 0, diff(cummax(cumPNL1)) > 0)
peakIndex3 <- c(cumPNL3[1] > 0, diff(cummax(cumPNL3)) > 0)
str(cbind(1 : length(cumPNL1), cumPNL1)[peakIndex1,])
str(cbind(1 : length(cumPNL3), cumPNL3)[peakIndex3,])
Output:
> str(cbind(1 : length(cumPNL1), cumPNL1)[peakIndex1,])
Named num [1:2] 14 276
- attr(*, "names")= chr [1:2] "" "cumPNL1"
> str(cbind(1 : length(cumPNL3), cumPNL3)[peakIndex3,])
num [1:12, 1:2] 1 15 16 19 20 22 23 24 25 26 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:2] "" "cumPNL3"
Usually setting drop = FALSE preserves the object, e.g., str(cbind(1 : length(cumPNL1), cumPNL1)[peakIndex1, , drop = FALSE]). Note the extra comma: with only a single index subscript the matrix is treated as a plain vector, so drop = FALSE on its own has no effect, which is likely why it did not seem to work here. However, changing the points line to the following also fixes the problem:
points(seq_along(cumPNL3)[peakIndex], cumPNL3[peakIndex], pch = 19,
col = 'green', cex = 0.6)
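An equivalent fix that keeps the original cbind() call (a sketch along the same lines) is to prevent the dimension drop explicitly; the extra comma applies the logical index to the rows, so the result stays a matrix even when only one peak is selected:
points(cbind(seq_along(cumPNL3), cumPNL3)[peakIndex, , drop = FALSE],
       pch = 19, col = 'green', cex = 0.6)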
Thanks for reporting the issue. I will push the fix to GitHub tomorrow.

how to calculate correlation with a sliding window?

I have a zoo object called aux with yearly data from 1961 to 2009:
x$nao x[, 2]
1961 -0.03 63.3
1962 0.20 155.9
1963 -2.98 211.0
I want to calculate the correlation between the two columns using a 20-year sliding window. I am trying to use rollapply, but I can't seem to make it work. I tried several different ways, always without success...
> rollapply(aux,20, cor(aux[,1],aux[,2],method="pearson"))
Error in match.fun(FUN) : 'cor(aux[, 1], aux[, 2], method = "pearson")' is not a function, character or symbol
> rollapply(aux,20, cor,method="pearson")
Error in FUN(coredata(data)[posns], ...) : supply both 'x' and 'y' or a matrix-like 'x'
> rollapply(aux,20, cor)
Error in FUN(coredata(data)[posns], ...) : supply both 'x' and 'y' or a matrix-like 'x'
Can anybody tell me how to make rollapply work?
Thanks for helping!
Try this.
library(quantmod)
library(TTR)
#Set the seed so results can be duplicated
set.seed(123)
#Build a zoo object with typical price data
var1 <- zoo(cumprod(1+rnorm(50, 0.01, 0.05)), seq(1961, 2010, 1))
var2 <- zoo(cumprod(1+rnorm(50, 0.015, 0.1)), seq(1961, 2010, 1))
dat <- merge(var1=var1, var2=var2)
plot(dat)
grid()
#Calculate the percent returns for the two prices
del1 <- Delt(dat$var1)
del2 <- Delt(dat$var2)
dat <- merge(dat, del1=del1, del2=del2)
dimnames(dat)[[2]][3] <- "del1"
dimnames(dat)[[2]][4] <- "del2"
head(dat)
plot(dat)
#Calculate the correlation between the two returns using a 5 year sliding window
delcor <- runCor(dat$del1, dat$del2, n=5, use="all.obs", sample=TRUE, cumulative=FALSE)
dat <- merge(dat, delcor)
plot(dat$delcor, type="l", main="Sliding Window Correlation of Two Return Series", xlab="", col="red")
grid()
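For reference, a rollapply-based version closer to what the question asked for (a sketch, assuming aux is a two-column zoo object as described; by.column = FALSE hands the whole 20-row window to the function instead of one column at a time):
library(zoo)
rollcor <- rollapplyr(aux, width = 20, by.column = FALSE,
                      FUN = function(z) cor(z[, 1], z[, 2], method = "pearson"))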

Speed up the analysis

I have two data frames in R, for example df and dfrefseq:
df<-data.frame( chr = c("chr1","chr1","chr1","chr4")
, start = c(843294,4329248,4329423,4932234)
, stop = c(845294,4329248,4529423,4935234)
, genenames= c("HTA","OdX","FEA","MGA")
)
dfrefseq<-data.frame( chr = c("chr1","chr1","chr1","chr2")
, start = c(843294,4329248,4329423,4932234)
, stop = c(845294,4329248,4529423,4935234)
, genenames= c("tra","FGE","FFs","FAA")
)
I want to check, for each gene in df, which gene in dfrefseq lies closest to it.
I first selected "chr1" in both data frames.
Then, for the first gene in readschr1, I calculated the distances between the start-start, start-stop, stop-start and stop-stop sites.
The sum of these calculations says everything about the distance. My question: how can I speed up this analysis? Right now I only test 1 gene against a data frame, but I need to test 2000 genes.
readschr1 <- subset(df,df[,1]=="chr1")
refseqchr1 <- subset(dfrefseq,dfrefseq[,1]=="chr1")
names<-list()
read_start_start<-list()
read_start_stop<-list()
read_stop_start<-list()
read_stop_stop<-list()
for (i in 1:nrow(refseqchr1)) {
startstart<-abs(readschr1[1,2] - refseqchr1[i,2])
startstop<-abs(readschr1[1,2] - refseqchr1[i,3])
stopstart<-abs(readschr1[1,3] - refseqchr1[i,2])
stopstop<-abs(readschr1[1,3] - refseqchr1[i,3])
read_start_start[[i]]<- matrix(startstart)
read_start_stop[[i]]<- matrix(startstop)
read_stop_start[[i]]<- matrix(stopstart)
read_stop_stop[[i]]<- matrix(stopstop)
names[[i]]<-matrix(refseqchr1[i,4])
}
table<-cbind(names, read_start_start, read_start_stop, read_stop_start, read_stop_stop)
sumtotalcolumns<-as.numeric(table[,2]) + as.numeric(table[,3])+ as.numeric(table[,4]) + as.numeric(table[,5])
test<-cbind(table, sumtotalcolumns)
test1<-test[order(as.vector(test$sumtotalcolumns)), ]
Thank you!
The Bioconductor package GenomicRanges is designed to work with this type of data
source('http://bioconductor.org/biocLite.R')
biocLite('GenomicRanges') # one-time installation
then
library(GenomicRanges)
gr <- with(df,
GRanges(factor(chr, levels=paste("chr", 1:4, sep="")),
IRanges(start, stop), genenames=genenames))
grrefseq <- with(dfrefseq,
GRanges(factor(chr, levels=paste("chr", 1:4, sep="")),
IRanges(start, stop), genenames=genenames))
and
> nearest(gr, grrefseq)
[1] 1 2 3 NA
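If you then want to attach the name of the closest dfrefseq gene to each df gene, here is a hedged sketch using the index vector that nearest() returns (the chr4 gene stays NA because dfrefseq has no range on that chromosome):
idx <- nearest(gr, grrefseq)
# indexing with an NA position simply yields NA for the unmatched gene
df$nearest_refseq <- as.character(grrefseq$genenames)[idx]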
You can merge the two separate data.frames together to form one table and then use vectorized operations. The key to merge is to specify the common column(s) between the data.frames and to tell it what to do when there are cases that do not match. Specifying all = TRUE will return all rows and fill in NAs if there is no match in the other data.frame, i.e. ch2 and ch4 in this case. Once the data.frames have been merged, then it's a simple exercise in subtracting the different columns from one another and then summing the four columns of interest. I use transform to cut down on the typing needed to do the subtraction.
zz <- merge(df, dfrefseq, by = "chr", all = TRUE)
zz <- transform(zz,
read_start_start = abs(start.x - start.y)
, read_start_stop = abs(start.x - stop.y)
, read_stop_start = abs(stop.x - start.y)
, read_stop_stop = abs(stop.x - stop.y)
)
zz <- transform(zz,
sum_total_columns = read_start_start + read_start_stop + read_stop_start + read_stop_stop
)
Here's one approach to get the row with the minimum distance. I'm assuming you want to do this by chr and genenames. I use the plyr package, but there are base solutions as well if you'd prefer one of those; a rough base-R sketch follows the plyr call below.
require(plyr)
ddply(zz, c("chr", "genenames.x"), function(x) x[which.min(x$sum_total_columns) ,])
