Calculate summary statistics grouped by variable
dby(
data,
INPUT,
...,
ID = NULL,
ORDER = NULL,
SUBSET = NULL,
SORT = 0,
COMBINE = !REDUCE,
NOCHECK = FALSE,
ARGS = NULL,
NAMES,
COLUMN = FALSE,
REDUCE = FALSE,
REGEX = mets.options()$regex,
ALL = TRUE
)
Data.frame
Input variables (character or formula)
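Functions (or constant values) to apply within each group; argument names, when given, are used as the names of the resulting columns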
id variable
(optional) order variable
(optional) subset expression
sort order (id+order variable)
If TRUE result is appended to data
If TRUE, skip sorting and the check for missing data
Optional list of arguments to functions (...)
Optional vector of column names
If TRUE do the calculations for each column
Reduce number of redundant rows
Allow regular expressions
If FALSE only the subset will be returned
Calculate summary statistics grouped by variable.
Use dby2 for column-wise calculations.
n <- 4
k <- c(3,rbinom(n-1,3,0.5)+1)
N <- sum(k)
d <- data.frame(y=rnorm(N),x=rnorm(N),id=rep(seq(n),k),num=unlist(sapply(k,seq)))
d2 <- d[sample(nrow(d)),]
dby(d2, y~id, mean)
#> y x id num mean
#> 2 -0.2774354 1.0222523 1 2 -1.2812160
#> 3 -3.2679895 -0.4357077 1 3 -1.2812160
#> 1 -0.2982232 0.4953284 1 1 -1.2812160
#> 4 2.2863256 2.5695189 2 1 0.6687838
#> 6 -0.8935153 0.7784325 2 3 0.6687838
#> 5 -0.7890313 0.6645009 2 2 0.6687838
#> 7 2.0713562 -0.8995125 2 4 0.6687838
#> 9 -0.6849653 -1.0908300 3 2 -0.1056147
#> 8 0.4737359 -1.5351550 3 1 -0.1056147
#> 11 0.2457815 -1.9814271 4 2 -0.4985698
#> 10 -1.2429211 -0.1138345 4 1 -0.4985698
dby(d2, y~id + order(num), cumsum)
#> y x id num cumsum
#> 1 -0.2982232 0.4953284 1 1 -0.2982232
#> 2 -0.2774354 1.0222523 1 2 -0.5756586
#> 3 -3.2679895 -0.4357077 1 3 -3.8436481
#> 4 2.2863256 2.5695189 2 1 2.2863256
#> 5 -0.7890313 0.6645009 2 2 1.4972944
#> 6 -0.8935153 0.7784325 2 3 0.6037790
#> 7 2.0713562 -0.8995125 2 4 2.6751353
#> 8 0.4737359 -1.5351550 3 1 0.4737359
#> 9 -0.6849653 -1.0908300 3 2 -0.2112293
#> 10 -1.2429211 -0.1138345 4 1 -1.2429211
#> 11 0.2457815 -1.9814271 4 2 -0.9971395
dby(d,y ~ id + order(num), dlag)
#> y x id num dlag
#> 1 -0.2982232 0.4953284 1 1 NA
#> 2 -0.2774354 1.0222523 1 2 -0.2982232
#> 3 -3.2679895 -0.4357077 1 3 -0.2774354
#> 4 2.2863256 2.5695189 2 1 NA
#> 5 -0.7890313 0.6645009 2 2 2.2863256
#> 6 -0.8935153 0.7784325 2 3 -0.7890313
#> 7 2.0713562 -0.8995125 2 4 -0.8935153
#> 8 0.4737359 -1.5351550 3 1 NA
#> 9 -0.6849653 -1.0908300 3 2 0.4737359
#> 10 -1.2429211 -0.1138345 4 1 NA
#> 11 0.2457815 -1.9814271 4 2 -1.2429211
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2))
#> y x id num dlag1 dlag2
#> 1 -0.2982232 0.4953284 1 1 NA NA
#> 2 -0.2774354 1.0222523 1 2 -0.2982232 NA
#> 3 -3.2679895 -0.4357077 1 3 -0.2774354 -0.2982232
#> 4 2.2863256 2.5695189 2 1 NA NA
#> 5 -0.7890313 0.6645009 2 2 2.2863256 NA
#> 6 -0.8935153 0.7784325 2 3 -0.7890313 2.2863256
#> 7 2.0713562 -0.8995125 2 4 -0.8935153 -0.7890313
#> 8 0.4737359 -1.5351550 3 1 NA NA
#> 9 -0.6849653 -1.0908300 3 2 0.4737359 NA
#> 10 -1.2429211 -0.1138345 4 1 NA NA
#> 11 0.2457815 -1.9814271 4 2 -1.2429211 NA
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2), NAMES=c("l1","l2"))
#> y x id num l1 l2
#> 1 -0.2982232 0.4953284 1 1 NA NA
#> 2 -0.2774354 1.0222523 1 2 -0.2982232 NA
#> 3 -3.2679895 -0.4357077 1 3 -0.2774354 -0.2982232
#> 4 2.2863256 2.5695189 2 1 NA NA
#> 5 -0.7890313 0.6645009 2 2 2.2863256 NA
#> 6 -0.8935153 0.7784325 2 3 -0.7890313 2.2863256
#> 7 2.0713562 -0.8995125 2 4 -0.8935153 -0.7890313
#> 8 0.4737359 -1.5351550 3 1 NA NA
#> 9 -0.6849653 -1.0908300 3 2 0.4737359 NA
#> 10 -1.2429211 -0.1138345 4 1 NA NA
#> 11 0.2457815 -1.9814271 4 2 -1.2429211 NA
dby(d, y~id + order(num), mean=mean, csum=cumsum, n=length)
#> y x id num mean csum n
#> 1 -0.2982232 0.4953284 1 1 -1.2812160 -0.2982232 3
#> 2 -0.2774354 1.0222523 1 2 -1.2812160 -0.5756586 3
#> 3 -3.2679895 -0.4357077 1 3 -1.2812160 -3.8436481 3
#> 4 2.2863256 2.5695189 2 1 0.6687838 2.2863256 4
#> 5 -0.7890313 0.6645009 2 2 0.6687838 1.4972944 4
#> 6 -0.8935153 0.7784325 2 3 0.6687838 0.6037790 4
#> 7 2.0713562 -0.8995125 2 4 0.6687838 2.6751353 4
#> 8 0.4737359 -1.5351550 3 1 -0.1056147 0.4737359 2
#> 9 -0.6849653 -1.0908300 3 2 -0.1056147 -0.2112293 2
#> 10 -1.2429211 -0.1138345 4 1 -0.4985698 -1.2429211 2
#> 11 0.2457815 -1.9814271 4 2 -0.4985698 -0.9971395 2
dby(d2, y~id + order(num), a=cumsum, b=mean, N=length, l1=function(x) c(NA,x)[-length(x)])
#> y x id num a b N l1
#> 1 -0.2982232 0.4953284 1 1 -0.2982232 -1.2812160 3 NA
#> 2 -0.2774354 1.0222523 1 2 -0.5756586 -1.2812160 3 -0.2982232
#> 3 -3.2679895 -0.4357077 1 3 -3.8436481 -1.2812160 3 -3.2679895
#> 4 2.2863256 2.5695189 2 1 2.2863256 0.6687838 4 NA
#> 5 -0.7890313 0.6645009 2 2 1.4972944 0.6687838 4 2.2863256
#> 6 -0.8935153 0.7784325 2 3 0.6037790 0.6687838 4 -0.7890313
#> 7 2.0713562 -0.8995125 2 4 2.6751353 0.6687838 4 2.0713562
#> 8 0.4737359 -1.5351550 3 1 0.4737359 -0.1056147 2 NA
#> 9 -0.6849653 -1.0908300 3 2 -0.2112293 -0.1056147 2 -0.6849653
#> 10 -1.2429211 -0.1138345 4 1 -1.2429211 -0.4985698 2 NA
#> 11 0.2457815 -1.9814271 4 2 -0.9971395 -0.4985698 2 0.2457815
dby(d, y~id + order(num), nn=seq_along, n=length)
#> y x id num nn n
#> 1 -0.2982232 0.4953284 1 1 1 3
#> 2 -0.2774354 1.0222523 1 2 2 3
#> 3 -3.2679895 -0.4357077 1 3 3 3
#> 4 2.2863256 2.5695189 2 1 1 4
#> 5 -0.7890313 0.6645009 2 2 2 4
#> 6 -0.8935153 0.7784325 2 3 3 4
#> 7 2.0713562 -0.8995125 2 4 4 4
#> 8 0.4737359 -1.5351550 3 1 1 2
#> 9 -0.6849653 -1.0908300 3 2 2 2
#> 10 -1.2429211 -0.1138345 4 1 1 2
#> 11 0.2457815 -1.9814271 4 2 2 2
dby(d, y~id + order(num), nn=seq_along, n=length)
#> y x id num nn n
#> 1 -0.2982232 0.4953284 1 1 1 3
#> 2 -0.2774354 1.0222523 1 2 2 3
#> 3 -3.2679895 -0.4357077 1 3 3 3
#> 4 2.2863256 2.5695189 2 1 1 4
#> 5 -0.7890313 0.6645009 2 2 2 4
#> 6 -0.8935153 0.7784325 2 3 3 4
#> 7 2.0713562 -0.8995125 2 4 4 4
#> 8 0.4737359 -1.5351550 3 1 1 2
#> 9 -0.6849653 -1.0908300 3 2 2 2
#> 10 -1.2429211 -0.1138345 4 1 1 2
#> 11 0.2457815 -1.9814271 4 2 2 2
d <- d[,1:4]
dby(d, x<0) <- list(z=mean)
d <- dby(d, is.na(z), z=1)
f <- function(x) apply(x,1,min)
dby(d, y+x~id, min=f)
#> Error: object 'f' not found
dby(d,y+x~id+order(num), function(x) x)
#> y x id num z _11 _12
#> 1 -0.2982232 0.4953284 1 1 1.0000000 -0.2982232 0.4953284
#> 2 -0.2774354 1.0222523 1 2 1.0000000 -0.2774354 1.0222523
#> 3 -3.2679895 -0.4357077 1 3 0.8974388 -3.2679895 -0.4357077
#> 4 2.2863256 2.5695189 2 1 1.0000000 2.2863256 2.5695189
#> 5 -0.7890313 0.6645009 2 2 1.0000000 -0.7890313 0.6645009
#> 6 -0.8935153 0.7784325 2 3 1.0000000 -0.8935153 0.7784325
#> 7 2.0713562 -0.8995125 2 4 0.8974388 2.0713562 -0.8995125
#> 8 0.4737359 -1.5351550 3 1 0.8974388 0.4737359 -1.5351550
#> 9 -0.6849653 -1.0908300 3 2 0.8974388 -0.6849653 -1.0908300
#> 10 -1.2429211 -0.1138345 4 1 0.8974388 -1.2429211 -0.1138345
#> 11 0.2457815 -1.9814271 4 2 0.8974388 0.2457815 -1.9814271
f <- function(x) { cbind(cumsum(x[,1]),cumsum(x[,2]))/sum(x)}
dby(d, y+x~id, f)
#> Error: object 'f' not found
## column-wise
a <- d
dby2(a, mean, median, REGEX=TRUE) <- '^[y|x]'~id
a
#> y x id num z mean.y mean.x median.y
#> 1 -0.2982232 0.4953284 1 1 1.0000000 -1.2812160 0.3606243 -0.2982232
#> 2 -0.2774354 1.0222523 1 2 1.0000000 -1.2812160 0.3606243 -0.2982232
#> 3 -3.2679895 -0.4357077 1 3 0.8974388 -1.2812160 0.3606243 -0.2982232
#> 4 2.2863256 2.5695189 2 1 1.0000000 0.6687838 0.7782349 0.6411625
#> 5 -0.7890313 0.6645009 2 2 1.0000000 0.6687838 0.7782349 0.6411625
#> 6 -0.8935153 0.7784325 2 3 1.0000000 0.6687838 0.7782349 0.6411625
#> 7 2.0713562 -0.8995125 2 4 0.8974388 0.6687838 0.7782349 0.6411625
#> 8 0.4737359 -1.5351550 3 1 0.8974388 -0.1056147 -1.3129925 -0.1056147
#> 9 -0.6849653 -1.0908300 3 2 0.8974388 -0.1056147 -1.3129925 -0.1056147
#> 10 -1.2429211 -0.1138345 4 1 0.8974388 -0.4985698 -1.0476308 -0.4985698
#> 11 0.2457815 -1.9814271 4 2 0.8974388 -0.4985698 -1.0476308 -0.4985698
#> median.x
#> 1 0.4953284
#> 2 0.4953284
#> 3 0.4953284
#> 4 0.7214667
#> 5 0.7214667
#> 6 0.7214667
#> 7 0.7214667
#> 8 -1.3129925
#> 9 -1.3129925
#> 10 -1.0476308
#> 11 -1.0476308
## wildcards
dby2(a,'y*'+'x*'~id,mean)
#> y x id num z median.y median.x mean.y
#> 1 -0.2982232 0.4953284 1 1 1.0000000 -0.2982232 0.4953284 -1.2812160
#> 2 -0.2774354 1.0222523 1 2 1.0000000 -0.2982232 0.4953284 -1.2812160
#> 3 -3.2679895 -0.4357077 1 3 0.8974388 -0.2982232 0.4953284 -1.2812160
#> 4 2.2863256 2.5695189 2 1 1.0000000 0.6411625 0.7214667 0.6687838
#> 5 -0.7890313 0.6645009 2 2 1.0000000 0.6411625 0.7214667 0.6687838
#> 6 -0.8935153 0.7784325 2 3 1.0000000 0.6411625 0.7214667 0.6687838
#> 7 2.0713562 -0.8995125 2 4 0.8974388 0.6411625 0.7214667 0.6687838
#> 8 0.4737359 -1.5351550 3 1 0.8974388 -0.1056147 -1.3129925 -0.1056147
#> 9 -0.6849653 -1.0908300 3 2 0.8974388 -0.1056147 -1.3129925 -0.1056147
#> 10 -1.2429211 -0.1138345 4 1 0.8974388 -0.4985698 -1.0476308 -0.4985698
#> 11 0.2457815 -1.9814271 4 2 0.8974388 -0.4985698 -1.0476308 -0.4985698
#> mean.x
#> 1 0.3606243
#> 2 0.3606243
#> 3 0.3606243
#> 4 0.7782349
#> 5 0.7782349
#> 6 0.7782349
#> 7 0.7782349
#> 8 -1.3129925
#> 9 -1.3129925
#> 10 -1.0476308
#> 11 -1.0476308
## subset
dby(d, x<0) <- list(z=NA)
d
#> y x id num z
#> 1 -0.2982232 0.4953284 1 1 1
#> 2 -0.2774354 1.0222523 1 2 1
#> 3 -3.2679895 -0.4357077 1 3 NA
#> 4 2.2863256 2.5695189 2 1 1
#> 5 -0.7890313 0.6645009 2 2 1
#> 6 -0.8935153 0.7784325 2 3 1
#> 7 2.0713562 -0.8995125 2 4 NA
#> 8 0.4737359 -1.5351550 3 1 NA
#> 9 -0.6849653 -1.0908300 3 2 NA
#> 10 -1.2429211 -0.1138345 4 1 NA
#> 11 0.2457815 -1.9814271 4 2 NA
dby(d, y~id|x>-1, v=mean,z=1)
#> y x id num v z
#> 1 -0.2982232 0.4953284 1 1 -1.2812160 1
#> 2 -0.2774354 1.0222523 1 2 -1.2812160 1
#> 3 -3.2679895 -0.4357077 1 3 -1.2812160 1
#> 4 2.2863256 2.5695189 2 1 0.6687838 1
#> 5 -0.7890313 0.6645009 2 2 0.6687838 1
#> 6 -0.8935153 0.7784325 2 3 0.6687838 1
#> 7 2.0713562 -0.8995125 2 4 0.6687838 1
#> 8 0.4737359 -1.5351550 3 1 NA NA
#> 9 -0.6849653 -1.0908300 3 2 NA NA
#> 10 -1.2429211 -0.1138345 4 1 -1.2429211 1
#> 11 0.2457815 -1.9814271 4 2 NA NA
dby(d, y+x~id|x>-1, mean, median, COLUMN=TRUE)
#> y x id num z mean.y mean.x median.y median.x
#> 1 -0.2982232 0.4953284 1 1 1 -1.2812160 0.3606243 -0.2982232 0.4953284
#> 2 -0.2774354 1.0222523 1 2 1 -1.2812160 0.3606243 -0.2982232 0.4953284
#> 3 -3.2679895 -0.4357077 1 3 NA -1.2812160 0.3606243 -0.2982232 0.4953284
#> 4 2.2863256 2.5695189 2 1 1 0.6687838 0.7782349 0.6411625 0.7214667
#> 5 -0.7890313 0.6645009 2 2 1 0.6687838 0.7782349 0.6411625 0.7214667
#> 6 -0.8935153 0.7784325 2 3 1 0.6687838 0.7782349 0.6411625 0.7214667
#> 7 2.0713562 -0.8995125 2 4 NA 0.6687838 0.7782349 0.6411625 0.7214667
#> 8 0.4737359 -1.5351550 3 1 NA NA NA NA NA
#> 9 -0.6849653 -1.0908300 3 2 NA NA NA NA NA
#> 10 -1.2429211 -0.1138345 4 1 NA -1.2429211 -0.1138345 -1.2429211 -0.1138345
#> 11 0.2457815 -1.9814271 4 2 NA NA NA NA NA
dby2(d, y+x~id|x>0, mean, REDUCE=TRUE)
#> id mean.y mean.x
#> 1 1 -0.2878293 0.7587903
#> 2 2 0.2012597 1.3374841
dby(d,y~id|x<0,mean,ALL=FALSE)
#> y x id num z mean
#> 3 -3.2679895 -0.4357077 1 3 NA -3.2679895
#> 7 2.0713562 -0.8995125 2 4 NA 2.0713562
#> 8 0.4737359 -1.5351550 3 1 NA -0.1056147
#> 9 -0.6849653 -1.0908300 3 2 NA -0.1056147
#> 10 -1.2429211 -0.1138345 4 1 NA -0.4985698
#> 11 0.2457815 -1.9814271 4 2 NA -0.4985698
a <- iris
a <- dby(a,y=1)
dby(a,Species=="versicolor") <- list(y=2)