Data Frame Subset Performance

I have a couple of large data frames (1 million+ rows x 6-10 columns) that I need to subset repeatedly. The subsetting is the slowest part of my code, and I'm curious whether there is a way to do it faster.
load("https://dl.dropbox.com/u/4131944/Temp/DF_IOSTAT_ALL.rda")
start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M")
end_in<- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")
system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,])
   user  system elapsed
  16.59    0.00   16.60
dput(head(DF_IOSTAT_ALL))
structure(list(date_stamp = structure(list(sec = c(14, 24, 34,
44, 54, 4), min = c(0L, 0L, 0L, 0L, 0L, 1L), hour = c(0L, 0L,
0L, 0L, 0L, 0L), mday = c(20L, 20L, 20L, 20L, 20L, 20L), mon = c(7L,
7L, 7L, 7L, 7L, 7L), year = c(112L, 112L, 112L, 112L, 112L, 112L
), wday = c(1L, 1L, 1L, 1L, 1L, 1L), yday = c(232L, 232L, 232L,
232L, 232L, 232L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"
), class = c("POSIXlt", "POSIXt")), cpu = c(0.9, 0.2, 0.2, 0.1,
0.2, 0.1), rsec_s = c(0, 0, 0, 0, 0, 0), wsec_s = c(0, 3.8, 0,
0.4, 0.2, 0.2), util_pct = c(0, 0.1, 0, 0, 0, 0), node = c("bda101",
"bda101", "bda101", "bda101", "bda101", "bda101")), .Names = c("date_stamp",
"cpu", "rsec_s", "wsec_s", "util_pct", "node"), row.names = c(NA,
6L), class = "data.frame")

I would use xts for this. The only potential hiccup is that xts is a matrix with an ordered index attribute, so you can't mix types like you can in a data.frame.
If the node column is invariant, you can just exclude it from your xts object:
library(xts)
x <- xts(DF_IOSTAT_ALL[,2:5], as.POSIXct(DF_IOSTAT_ALL$date_stamp))
x["2012-08-20 00:00:24/2012-08-20 00:00:54"]
Update using the OP's actual data:
Data <- DF_IOSTAT_ALL
# change node from character to numeric,
# so it can exist in the xts object too.
Data$node <- as.numeric(gsub("^bda","",Data$node))
# create the xts object
x <- xts(Data[,-1], as.POSIXct(Data$date_stamp))
# subset one day
system.time(x['2012-08-20 13:00/2012-08-20 17:00'])
# user system elapsed
# 0 0 0
# subset 13:00-17:00 for all days
system.time(x['T13:00/T17:00'])
# user system elapsed
# 2.64 0.00 2.66
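If you need a data.frame again after subsetting, one possible round-trip (a sketch using the x object above; x_sub is just an illustrative name) is:
x_sub <- x['2012-08-20 13:00/2012-08-20 17:00']
# rebuild a data.frame with the index as a POSIXct column
DF_sub <- data.frame(date_stamp = index(x_sub), coredata(x_sub))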

Here are my experiments with data.table. Interestingly, just converting to data.table makes the lookup faster, possibly because it builds the logical subsetting vectors more efficiently. I compared four things: the original data frame lookup; a lookup after converting date_stamp from POSIXlt to POSIXct (thanks to Matthew Dowle); the data.table lookup; and the data.table lookup including the setup cost of the copy and conversion. Even with the setup included, the data.table approach beats the original, and with repeated lookups the savings grow.
library(data.table)
library(rbenchmark)
load("DF_IOSTAT_ALL.rda")
DF_IOSTAT_ALL.original <- DF_IOSTAT_ALL
start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M")
end_in<- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")
#function to test: original
fun <- function() DF_IOSTAT_INT <<- DF_IOSTAT_ALL.original[DF_IOSTAT_ALL.original$date_stamp >= start_in & DF_IOSTAT_ALL.original$date_stamp <= end_in,]
#function to test: changing to POSIXct
DF_IOSTAT_ALL.ct <- within(DF_IOSTAT_ALL.original,date_stamp <- as.POSIXct(date_stamp))
fun.ct <- function() DF_IOSTAT_INT <<- DF_IOSTAT_ALL.ct[with(DF_IOSTAT_ALL.ct,date_stamp >= start_in & date_stamp <= end_in),]
#function to test: with data.table and POSIXct
DF_IOSTAT_ALL.dt <- as.data.table(DF_IOSTAT_ALL.ct);
fun.dt <- function() DF_IOSTAT_INT <<- DF_IOSTAT_ALL.dt[date_stamp >= start_in & date_stamp <= end_in,]
#function to test: with data table and POSIXct, with setup steps
newfun <- function() {
DF_IOSTAT_ALL <- DF_IOSTAT_ALL.original;
#data.table doesn't play well with POSIXlt, so convert to POSIXct
DF_IOSTAT_ALL$date_stamp <- as.POSIXct(DF_IOSTAT_ALL$date_stamp);
DF_IOSTAT_ALL <- data.table(DF_IOSTAT_ALL);
DF_IOSTAT_INT <<- DF_IOSTAT_ALL[date_stamp >= start_in & date_stamp <= end_in,];
}
benchmark(fun(), fun.ct(), fun.dt(), newfun(), replications=3,order="relative")
# test replications elapsed relative user.self sys.self user.child sys.child
#3 fun.dt() 3 0.18 1.000000 0.11 0.08 NA NA
#2 fun.ct() 3 0.52 2.888889 0.44 0.08 NA NA
#4 newfun() 3 35.49 197.166667 34.88 0.58 NA NA
#1 fun() 3 66.68 370.444444 66.42 0.15 NA NA
If you know what your time intervals are beforehand, you can probably make it even faster by splitting with findInterval or cut and keying/indexing the table.
DF_IOSTAT_ALL <- copy(DF_IOSTAT_ALL.dt)
time.breaks <- as.POSIXct("2012-08-19 19:00:00") + 0:178 * 60 * 60 #by hour
DF_IOSTAT_ALL[,interval := findInterval(date_stamp,time.breaks)]
setkey(DF_IOSTAT_ALL,interval)
start_in <- time.breaks[60]
end_in <- time.breaks[61]
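# fun2: a comparison subsetting function (its definition is not shown in this excerpt)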
benchmark(a <- DF_IOSTAT_ALL[J(60)],b <- fun2(DF_IOSTAT_ALL))
# test replications elapsed relative user.self sys.self user.child sys.child
#1 DF_IOSTAT_ALL[J(60)] 100 0.78 1.000000 0.64 0.14 NA NA
#2 fun2(DF_IOSTAT_ALL) 100 6.69 8.576923 5.76 0.91 NA NA
all.equal(a,b[,.SD,.SDcols=c(12,1:11,13)]) #test for equality (rearranging columns to match)
#TRUE
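A related option (not benchmarked above; just a sketch assuming the DF_IOSTAT_ALL.dt object created earlier and a reasonably recent data.table): the %between% helper expresses the same range filter a little more compactly.
library(data.table)
DF_IOSTAT_INT <- DF_IOSTAT_ALL.dt[date_stamp %between% c(as.POSIXct(start_in), as.POSIXct(end_in))]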

Related

Convert to xts format from matrix format

I have a 131 x 29 matrix X that I want to convert to an xts object with a yearly index. I have limited knowledge of the R language.
library(xts)
data <- matrix(rnorm(n = 131 * 29), ncol = 29)
timeindex <- seq.Date(from = as.Date("1900-01-01"), by = "years", length.out = 131)
myxts <- xts(x = data, order.by = timeindex)
> head(myxts[, c(1:3)])
                  [,1]       [,2]        [,3]
1900-01-01 -0.05270081  0.2177585 -0.01717907
1901-01-01 -0.50514942 -0.7325821  0.90430048
1902-01-01 -0.04710051 -0.9079267  0.53016427
1903-01-01  0.23658303 -1.2000134  0.99272586
1904-01-01  0.67484632 -2.3547689  1.44805954
1905-01-01  1.22531681  1.2777569  0.43727875
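If your real matrix already carries the years, e.g. as rownames, you can build the index from them instead of a synthetic sequence. A sketch, assuming rownames like "1900", "1901", ...:
# hypothetical: rownames(X) hold the years
timeindex <- as.Date(paste0(rownames(X), "-01-01"))
myxts <- xts(x = X, order.by = timeindex)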

GEKKO Exception: #error: Max Equation Length (Number of variables greater than 100k)

I need to run an optimization with 100k to 500k variables, but it gives me a "max equation length reached" error. Can anyone help me set up this problem? Time is not a constraint; as long as it finishes in 3-4 hours, that's fine.
df1 = df_opt.head(100000).copy()
#initialize model
m= GEKKO()
m.options.SOLVER=1
#initialize variable
x = np.array([m.Var(lb=0,ub=100,integer=True) for i in range(len(df1))])
#constraints
m.Equation(m.sum(x)<=30000)
#objective
responsiveness = np.array([m.Const(i) for i in df1['responsivness'].values])
affinity_score = np.array([m.Const(i) for i in df1['affinity'].values])
cost = np.array([m.Const(i) for i in df1['cost'].values])
expr = np.array([m.log(i) - k * j \
for i,j,k in zip((1+responsiveness * affinity_score * x),x,cost)])
m.Obj(-(m.sum(expr)))
#optimization
m.solve(disp=False)
When creating a question, it is important to include a minimal, complete example. Here is a modification that creates a random DataFrame with n rows.
from gekko import GEKKO
import numpy as np
import pandas as pd
n = 10
df1 = pd.DataFrame({'responsivness':np.random.rand(n),\
'affinity':np.random.rand(n),\
'cost':np.random.rand(n)})
print(df1.head())
#initialize model
m= GEKKO(remote=False)
m.options.SOLVER=1
#initialize variable
x = np.array([m.Var(lb=0,ub=100,integer=True) for i in range(len(df1))])
#constraints
m.Equation(m.sum(x)<=30000)
#objective
responsiveness = np.array([m.Const(i) for i in df1['responsivness'].values])
affinity_score = np.array([m.Const(i) for i in df1['affinity'].values])
cost = np.array([m.Const(i) for i in df1['cost'].values])
expr = np.array([m.log(i) - k * j \
for i,j,k in zip((1+responsiveness * affinity_score * x),x,cost)])
m.Obj(-(m.sum(expr)))
#optimization
m.solve(disp=True)
This solves successfully for n=10 with the random numbers selected.
--------- APM Model Size ------------
Each time step contains
Objects : 0
Constants : 30
Variables : 11
Intermediates: 0
Connections : 0
Equations : 2
Residuals : 2
Number of state variables: 11
Number of total equations: - 1
Number of slack variables: - 1
---------------------------------------
Degrees of freedom : 9
----------------------------------------------
Steady State Optimization with APOPT Solver
----------------------------------------------
Iter: 1 I: 0 Tm: 0.00 NLPi: 20 Dpth: 0 Lvs: 3 Obj: -1.35E+00 Gap: NaN
--Integer Solution: -1.34E+00 Lowest Leaf: -1.35E+00 Gap: 4.73E-03
Iter: 2 I: 0 Tm: 0.00 NLPi: 2 Dpth: 1 Lvs: 3 Obj: -1.34E+00 Gap: 4.73E-03
Successful solution
---------------------------------------------------
Solver : APOPT (v1.0)
Solution time : 1.519999999436550E-002 sec
Objective : -1.34078995171088
Successful solution
---------------------------------------------------
The underlying model gk_model0.apm can be accessed by navigating to m.path or by using m.open_folder().
Model
Constants
i0 = 0.14255660947333681
i1 = 0.9112789578520111
i2 = 0.10526966142004568
i3 = 0.6255161023214897
i4 = 0.2434604974789274
i5 = 0.812768922376058
i6 = 0.555163868440599
i7 = 0.7286240480266872
i8 = 0.39643651685899695
i9 = 0.4664238475079081
i10 = 0.588654005219946
i11 = 0.7807594551372589
i12 = 0.623910408858981
i13 = 0.19421798736230456
i14 = 0.3061420839190525
i15 = 0.07764492888189267
i16 = 0.7276569154297892
i17 = 0.5630014016669598
i18 = 0.9633171115575193
i19 = 0.23310692223695684
i20 = 0.008089496373502647
i21 = 0.7533529530133879
i22 = 0.4218710975774087
i23 = 0.03329287687223692
i24 = 0.9136665338169284
i25 = 0.7528330460265494
i26 = 0.0810779357870034
i27 = 0.4183140612726107
i28 = 0.4381547602657835
i29 = 0.907339329732971
End Constants
Variables
int_v1 = 0, <= 100, >= 0
int_v2 = 0, <= 100, >= 0
int_v3 = 0, <= 100, >= 0
int_v4 = 0, <= 100, >= 0
int_v5 = 0, <= 100, >= 0
int_v6 = 0, <= 100, >= 0
int_v7 = 0, <= 100, >= 0
int_v8 = 0, <= 100, >= 0
int_v9 = 0, <= 100, >= 0
int_v10 = 0, <= 100, >= 0
End Variables
Equations
(((((((((int_v1+int_v2)+int_v3)+int_v4)+int_v5)+int_v6)+int_v7)+int_v8)+int_v9)+int_v10)<=30000
minimize (-((((((((((log((1+((((i0)*(i10)))*(int_v1))))-((i20)*(int_v1)))+(log((1+((((i1)*(i11)))*(int_v2))))-((i21)*(int_v2))))+(log((1+((((i2)*(i12)))*(int_v3))))-((i22)*(int_v3))))+(log((1+((((i3)*(i13)))*(int_v4))))-((i23)*(int_v4))))+(log((1+((((i4)*(i14)))*(int_v5))))-((i24)*(int_v5))))+(log((1+((((i5)*(i15)))*(int_v6))))-((i25)*(int_v6))))+(log((1+((((i6)*(i16)))*(int_v7))))-((i26)*(int_v7))))+(log((1+((((i7)*(i17)))*(int_v8))))-((i27)*(int_v8))))+(log((1+((((i8)*(i18)))*(int_v9))))-((i28)*(int_v9))))+(log((1+((((i9)*(i19)))*(int_v10))))-((i29)*(int_v10)))))
End Equations
End Model
You can avoid a large symbolic expression string by modifying the model as:
from gekko import GEKKO
import numpy as np
import pandas as pd
n = 5000
df1 = pd.DataFrame({'responsiveness':np.random.rand(n),\
'affinity':np.random.rand(n),\
'cost':np.random.rand(n)})
print(df1.head())
#initialize model
m= GEKKO(remote=False)
m.options.SOLVER=1
#initialize variable
x = np.array([m.Var(lb=0,ub=100,integer=True) for i in range(len(df1))])
#constraints
m.Equation(m.sum(list(x))<=30000)
#objective
responsiveness = df1['responsiveness'].values
affinity_score = df1['affinity'].values
cost = df1['cost'].values
[m.Maximize(m.log(i) - k * j) \
for i,j,k in zip((1+responsiveness * affinity_score * x),x,cost)]
#optimization
m.solve(disp=True)
m.open_folder()
This produces the following underlying model, whose symbolic expressions do not grow with the number of variables.
Model
Variables
int_v1 = 0, <= 100, >= 0
int_v2 = 0, <= 100, >= 0
int_v3 = 0, <= 100, >= 0
int_v4 = 0, <= 100, >= 0
int_v5 = 0, <= 100, >= 0
int_v6 = 0, <= 100, >= 0
int_v7 = 0, <= 100, >= 0
int_v8 = 0, <= 100, >= 0
int_v9 = 0, <= 100, >= 0
int_v10 = 0, <= 100, >= 0
v11 = 0
End Variables
Equations
v11<=30000
maximize (log((1+((0.16283879947305288)*(int_v1))))-((0.365323493448101)*(int_v1)))
maximize (log((1+((0.3509872155181691)*(int_v2))))-((0.12162206443479917)*(int_v2)))
maximize (log((1+((0.20134572143617518)*(int_v3))))-((0.47137701674279087)*(int_v3)))
maximize (log((1+((0.287818142242232)*(int_v4))))-((0.12042554857067544)*(int_v4)))
maximize (log((1+((0.48997709502894166)*(int_v5))))-((0.21084485862098745)*(int_v5)))
maximize (log((1+((0.6178277437136291)*(int_v6))))-((0.42602122419609056)*(int_v6)))
maximize (log((1+((0.13033555293152563)*(int_v7))))-((0.8796057438355324)*(int_v7)))
maximize (log((1+((0.5002025885707916)*(int_v8))))-((0.9703263879586648)*(int_v8)))
maximize (log((1+((0.7095523321888202)*(int_v9))))-((0.8498606490337451)*(int_v9)))
maximize (log((1+((0.6174815809937886)*(int_v10))))-((0.9390903075640681)*(int_v10)))
End Equations
Connections
int_v1 = sum_1.x[1]
int_v2 = sum_1.x[2]
int_v3 = sum_1.x[3]
int_v4 = sum_1.x[4]
int_v5 = sum_1.x[5]
int_v6 = sum_1.x[6]
int_v7 = sum_1.x[7]
int_v8 = sum_1.x[8]
int_v9 = sum_1.x[9]
int_v10 = sum_1.x[10]
v11 = sum_1.y
End Connections
Objects
sum_1 = sum(10)
End Objects
End Model
I fixed a bug in Gekko, so on the next release you should be able to use m.Equation(m.sum(x)<=30000) instead of converting x to a list. This modification now works for larger models that previously failed; I tested it with n=5000.
Number of state variables: 5002
Number of total equations: - 2
Number of slack variables: - 1
---------------------------------------
Degrees of freedom : 4999
----------------------------------------------
Steady State Optimization with APOPT Solver
----------------------------------------------
Iter: 1 I: 0 Tm: 313.38 NLPi: 14 Dpth: 0 Lvs: 3 Obj: -6.05E+02 Gap: NaN
--Integer Solution: -6.01E+02 Lowest Leaf: -6.05E+02 Gap: 6.60E-03
Iter: 2 I: 0 Tm: 0.06 NLPi: 2 Dpth: 1 Lvs: 3 Obj: -6.01E+02 Gap: 6.60E-03
Successful solution
---------------------------------------------------
Solver : APOPT (v1.0)
Solution time : 313.461699999985 sec
Objective : -600.648283994940
Successful solution
---------------------------------------------------
The solution time increases to 313.46 sec, and compiling the model also takes longer. You may want to start with smaller models and check how the computational time scales. I also recommend using remote=False to solve locally instead of on the remote server.
Integer optimization problems can take exponentially longer as the number of variables grows, so you'll want to make sure you aren't starting a problem that will require 30 years to complete. A good way to check this is to solve successively larger problems to get an idea of the scale-up.

How can I create a specific time interval in Ruby?

What I have tried so far ...
start_hour = 7
start_minute = 0 * 0.01
end_hour = 17
end_minute = 45 * 0.01
step_time = 25
start_time = start_hour + start_minute
end_time = end_hour + end_minute
if step_time > 59
step_time = 1 if step_time == 60
step_time = 1.3 if step_time == 90
step_time = 2 if step_time == 120
else
step_time *= 0.01
end
hours = []
(start_time..end_time).step(step_time).map do |x|
next if (x-x.to_i) > 0.55
hours << '%0.2f' % x.round(2).to_s
end
puts hours
If I enter a step interval of 0, 5, 10, or 20, I get the time interval I want, but if I enter 15, 25, or 90, I can't get the right range.
You currently have:
end_hour = 17
end_minute = 45 * 0.01
end_time = end_hour + end_minute
#=> 17.45
Although 17.45 looks like the correct value, it isn't. 45 minutes is 3 quarters (or 75%) of an hour, so the correct decimal value is 17.75.
You could change your code accordingly, but working with decimal hours is a bit strange. It's much easier to just work with minutes. Instead of turning the minutes into hours, you turn the hours into minutes:
start_hour = 7
start_minute = 0
start_time = start_hour * 60 + start_minute
#=> 420
end_hour = 17
end_minute = 45
end_time = end_hour * 60 + end_minute
#=> 1065
The total amount of minutes can easily be converted back to hour-minute pairs via divmod:
420.divmod(60) #=> [7, 0]
1065.divmod(60) #=> [17, 45]
Using the above, we can traverse the range without having to convert the step interval:
def hours(start_time, end_time, step_time)
(start_time..end_time).step(step_time).map do |x|
'%02d:%02d' % x.divmod(60)
end
end
hours(start_time, end_time, 25)
#=> ["07:00", "07:25", "07:50", "08:15", "08:40", "09:05", "09:30", "09:55",
# "10:20", "10:45", "11:10", "11:35", "12:00", "12:25", "12:50", "13:15",
# "13:40", "14:05", "14:30", "14:55", "15:20", "15:45", "16:10", "16:35",
# "17:00", "17:25"]
hours(start_time, end_time, 90)
#=> ["07:00", "08:30", "10:00", "11:30", "13:00", "14:30", "16:00", "17:30"]

Most efficient way of subsetting dataframes

Can anyone suggest a more efficient way of subsetting a data frame without using SQL, indexing, or data.table?
I looked for similar questions, and this one suggests the indexing option.
Here are ways to subset with timings.
#Dummy data
dat <- data.frame(x = runif(1000000, 1, 1000), y=runif(1000000, 1, 1000))
#Subset and time
system.time(x <- dat[dat$x > 500, ])
# user system elapsed
# 0.092 0.000 0.090
system.time(x <- dat[which(dat$x > 500), ])
# user system elapsed
# 0.040 0.032 0.070
system.time(x <- subset(dat, x > 500))
# user system elapsed
# 0.108 0.004 0.109
EDIT:
As Roland suggested, I used microbenchmark. It seems which() performs best.
library("ggplot2")
library("microbenchmark")
#Dummy data
dat <- data.frame(x = runif(1000000, 1, 1000), y=runif(1000000, 1, 1000))
#Benchmark
res <- microbenchmark( dat[dat$x > 500, ],
dat[which(dat$x > 500), ],
subset(dat, x > 500))
#plot
autoplot.microbenchmark(res)

Fastest way to count occurrences of each unique element

What is the fastest way to compute the number of occurrences for each unique element in a vector in R?
So far, I've tried the following five functions:
f1 <- function(x)
{
aggregate(x, by=list(x), FUN=length)
}
f2 <- function(x)
{
r <- rle(x)
aggregate(r$lengths, by=list(r$values), FUN=sum)
}
f3 <- function(x)
{
u <- unique(x)
data.frame(Group=u, Counts=vapply(u, function(y)sum(x==y), numeric(1)))
}
f4 <- function(x)
{
r <- rle(x)
u <- unique(r$values)
data.frame(Group=u, Counts=vapply(u, function(y)sum(r$lengths[r$values==y]), numeric(1)))
}
f5 <- function(x)
{
as.data.frame(unclass(rle(sort(x))))[,2:1]
}
Some of them do not return the result sorted by category, but that is not important. Here are the results (using the microbenchmark package):
> x <- sample(1:100, size=1e3, TRUE); microbenchmark(f1(x), f2(x), f3(x), f4(x), f5(x))
Unit: microseconds
expr min lq median uq max neval
f1(x) 4133.353 4230.3700 4272.5985 4394.1895 7038.420 100
f2(x) 4464.268 4549.8180 4615.3465 4728.1995 7457.435 100
f3(x) 1032.064 1063.0080 1091.7670 1135.4525 3824.279 100
f4(x) 4748.950 4801.3725 4861.2575 4947.3535 7831.308 100
f5(x) 605.769 696.9615 714.9815 729.5435 3411.817 100
>
> x <- sample(1:100, size=1e4, TRUE); microbenchmark(f1(x), f2(x), f3(x), f4(x), f5(x))
Unit: milliseconds
expr min lq median uq max neval
f1(x) 25.057491 25.739892 25.937021 26.321998 27.875918 100
f2(x) 27.223552 27.718469 28.023355 28.537022 30.584403 100
f3(x) 5.361635 5.458289 5.537650 5.657967 8.261243 100
f4(x) 35.341726 35.841922 36.299161 38.012715 70.096613 100
f5(x) 2.158415 2.248881 2.281826 2.384304 4.793000 100
>
> x <- sample(1:100, size=1e5, TRUE); microbenchmark(f1(x), f2(x), f3(x), f4(x), f5(x), times=10)
Unit: milliseconds
expr min lq median uq max neval
f1(x) 236.53630 240.93358 242.88631 244.33994 250.75403 10
f2(x) 261.03280 263.61096 264.67032 265.81852 297.92244 10
f3(x) 53.94873 55.59020 59.05662 61.05741 87.23288 10
f4(x) 385.10217 390.44888 396.40572 399.23762 432.47262 10
f5(x) 18.31358 18.53492 18.84327 20.22700 20.34385 10
>
> x <- sample(1:100, size=1e6, TRUE); microbenchmark(f1(x), f2(x), f3(x), f4(x), f5(x), times=3)
Unit: milliseconds
expr min lq median uq max neval
f1(x) 2559.0462 2568.7480 2578.4498 2693.3116 2808.1734 3
f2(x) 2833.2622 2881.9241 2930.5860 2946.7877 2962.9895 3
f3(x) 743.6939 748.3331 752.9723 778.9532 804.9341 3
f4(x) 4471.8494 4544.6490 4617.4487 4696.2698 4775.0909 3
f5(x) 243.8903 253.2481 262.6058 269.1038 275.6018 3
>
> x <- sample(1:1000, size=1e6, TRUE); microbenchmark(f1(x), f2(x), f3(x), f4(x), f5(x), times=3)
Unit: milliseconds
expr min lq median uq max neval
f1(x) 2614.7104 2634.9312 2655.1520 2701.6216 2748.0912 3
f2(x) 3038.0353 3116.7499 3195.4645 3197.7423 3200.0202 3
f3(x) 6488.7268 6508.6495 6528.5722 6836.9738 7145.3754 3
f4(x) 40244.5038 40653.2633 41062.0229 41200.1973 41338.3717 3
f5(x) 244.2052 245.0331 245.8609 273.3307 300.8006 3
> x <- sample(1:10000, size=1e6, TRUE); microbenchmark(f1(x), f2(x), f3(x), f4(x), f5(x), times=3) # SLOW!
Unit: milliseconds
expr min lq median uq max neval
f1(x) 3279.2146 3300.7527 3322.2908 3338.6000 3354.9091 3
f2(x) 3563.5244 3578.3302 3593.1360 3597.2246 3601.3132 3
f3(x) 61303.6299 61928.4064 62553.1830 63089.5225 63625.8621 3
f4(x) 398792.7769 400346.2250 401899.6732 490921.6791 579943.6850 3
f5(x) 261.1835 263.7766 266.3697 287.3595 308.3494 3
(The last comparison is really slow; it takes a couple of minutes to run.)
Apparently, the winner is f5, but I'd like to see if it can be outperformed...
EDIT: Considering the suggestions f6 by @eddi, f8 by @AdamHyland (modified) and f9 by @dickoa, here are the new results:
f6 <- function(x)
{
data.table(x)[, .N, keyby = x]
}
f8 <- function(x)
{
fac <- factor(x)
data.frame(x = levels(fac), freq = tabulate(as.integer(fac)))
}
f9 <- plyr::count
Results:
> x <- sample(1:1e4, size=1e6, TRUE); microbenchmark(f5(x), f6(x), f8(x), f9(x), times=10)
Unit: milliseconds
expr min lq median uq max neval
f5(x) 291.8189 292.69771 293.2349 293.91216 296.3622 10
f6(x) 96.5717 96.73662 96.8249 99.25542 150.1081 10
f8(x) 659.3281 663.85092 669.6831 672.43613 699.4790 10
f9(x) 284.2978 296.41822 301.3535 331.92510 346.5567 10
> x <- sample(1:1e3, size=1e7, TRUE); microbenchmark(f5(x), f6(x), f8(x), f9(x), times=10)
Unit: milliseconds
expr min lq median uq max neval
f5(x) 3190.2555 3224.4201 3264.415 3359.823 3464.782 10
f6(x) 980.1287 989.9998 1051.559 1056.484 1085.580 10
f8(x) 5092.5847 5142.3289 5167.101 5244.400 5348.513 10
f9(x) 2799.6125 2843.1189 2881.734 2977.116 3081.437 10
So data.table is the winner! - so far :-)
P.S. I had to modify f8 to allow inputs like c(5,2,2,10), where not all integers from 1 to max(x) are present.
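For example, the factor-based f8 above handles such gaps (a quick check):
f8(c(5, 2, 2, 10))
#    x freq
# 1  2    2
# 2  5    1
# 3 10    1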
This is a little slower than tabulate, but is more universal (it will work with characters, factors, basically whatever you throw at it) and much easier to read/maintain/expand.
library(data.table)
f6 = function(x) {
data.table(x)[, .N, keyby = x]
}
x <- sample(1:1000, size=1e7, TRUE)
system.time(f6(x))
# user system elapsed
# 0.80 0.07 0.86
system.time(f8(x)) # tabulate + dickoa's conversion to data.frame
# user system elapsed
# 0.56 0.04 0.60
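As a quick illustration of the "more universal" point (a sketch using the f6 defined above; the exact print format depends on your data.table version), f6 works directly on character vectors, where tabulate() alone would not:
y <- c("apple", "banana", "apple", "cherry", "banana", "apple")
f6(y)
#         x N
# 1:  apple 3
# 2: banana 2
# 3: cherry 1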
UPDATE: As of data.table version 1.9.3, the data.table version is actually about 2x faster than tabulate + data.frame conversion.
There's almost nothing that will beat tabulate(), provided you can meet its initial conditions (a vector of positive integers).
x <- sample(1:100, size=1e7, TRUE)
system.time(tabulate(x))
# user system elapsed
# 0.071 0.000 0.072
@dickoa adds a few more notes in the comments on how to get the appropriate output, but tabulate as a workhorse function is the way to go.
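One way to attach labels to tabulate's raw counts (a sketch; essentially what f8 does, shown standalone here):
x <- sample(1:100, size = 1e7, TRUE)
counts <- tabulate(x)
res <- data.frame(Group = seq_along(counts), Counts = counts)
res <- res[res$Counts > 0, ]  # drop values that never occur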
