Calculate summary statistics grouped by variable

dby(
  data,
  INPUT,
  ...,
  ID = NULL,
  ORDER = NULL,
  SUBSET = NULL,
  SORT = 0,
  COMBINE = !REDUCE,
  NOCHECK = FALSE,
  ARGS = NULL,
  NAMES,
  COLUMN = FALSE,
  REDUCE = FALSE,
  REGEX = mets.options()$regex,
  ALL = TRUE
)

Arguments

data

Data.frame

INPUT

Input variables (character or formula)

...

Functions to apply to the input variables within each group (optionally named, e.g. mean=mean)

ID

id variable

ORDER

(optional) order variable

SUBSET

(optional) subset expression

SORT

sort order (id+order variable)

COMBINE

If TRUE, the result is appended to data

NOCHECK

If TRUE, skip sorting and the check for missing data

ARGS

Optional list of arguments to functions (...)

NAMES

Optional vector of column names

COLUMN

If TRUE do the calculations for each column

REDUCE

Reduce number of redundant rows

REGEX

Allow regular expressions

ALL

If FALSE, only the subset will be returned

Details

Calculate summary statistics grouped by variable.

Use dby2 for column-wise calculations (see also the COLUMN argument).

Author

Klaus K. Holst and Thomas Scheike

Examples

n <- 4
k <- c(3,rbinom(n-1,3,0.5)+1)
N <- sum(k)
d <- data.frame(y=rnorm(N),x=rnorm(N),id=rep(seq(n),k),num=unlist(sapply(k,seq)))
d2 <- d[sample(nrow(d)),]

dby(d2, y~id, mean)
#>             y          x id num       mean
#> 2  -0.2774354  1.0222523  1   2 -1.2812160
#> 3  -3.2679895 -0.4357077  1   3 -1.2812160
#> 1  -0.2982232  0.4953284  1   1 -1.2812160
#> 4   2.2863256  2.5695189  2   1  0.6687838
#> 6  -0.8935153  0.7784325  2   3  0.6687838
#> 5  -0.7890313  0.6645009  2   2  0.6687838
#> 7   2.0713562 -0.8995125  2   4  0.6687838
#> 9  -0.6849653 -1.0908300  3   2 -0.1056147
#> 8   0.4737359 -1.5351550  3   1 -0.1056147
#> 11  0.2457815 -1.9814271  4   2 -0.4985698
#> 10 -1.2429211 -0.1138345  4   1 -0.4985698
dby(d2, y~id + order(num), cumsum)
#>             y          x id num     cumsum
#> 1  -0.2982232  0.4953284  1   1 -0.2982232
#> 2  -0.2774354  1.0222523  1   2 -0.5756586
#> 3  -3.2679895 -0.4357077  1   3 -3.8436481
#> 4   2.2863256  2.5695189  2   1  2.2863256
#> 5  -0.7890313  0.6645009  2   2  1.4972944
#> 6  -0.8935153  0.7784325  2   3  0.6037790
#> 7   2.0713562 -0.8995125  2   4  2.6751353
#> 8   0.4737359 -1.5351550  3   1  0.4737359
#> 9  -0.6849653 -1.0908300  3   2 -0.2112293
#> 10 -1.2429211 -0.1138345  4   1 -1.2429211
#> 11  0.2457815 -1.9814271  4   2 -0.9971395

dby(d,y ~ id + order(num), dlag)
#>             y          x id num       dlag
#> 1  -0.2982232  0.4953284  1   1         NA
#> 2  -0.2774354  1.0222523  1   2 -0.2982232
#> 3  -3.2679895 -0.4357077  1   3 -0.2774354
#> 4   2.2863256  2.5695189  2   1         NA
#> 5  -0.7890313  0.6645009  2   2  2.2863256
#> 6  -0.8935153  0.7784325  2   3 -0.7890313
#> 7   2.0713562 -0.8995125  2   4 -0.8935153
#> 8   0.4737359 -1.5351550  3   1         NA
#> 9  -0.6849653 -1.0908300  3   2  0.4737359
#> 10 -1.2429211 -0.1138345  4   1         NA
#> 11  0.2457815 -1.9814271  4   2 -1.2429211
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2))
#>             y          x id num      dlag1      dlag2
#> 1  -0.2982232  0.4953284  1   1         NA         NA
#> 2  -0.2774354  1.0222523  1   2 -0.2982232         NA
#> 3  -3.2679895 -0.4357077  1   3 -0.2774354 -0.2982232
#> 4   2.2863256  2.5695189  2   1         NA         NA
#> 5  -0.7890313  0.6645009  2   2  2.2863256         NA
#> 6  -0.8935153  0.7784325  2   3 -0.7890313  2.2863256
#> 7   2.0713562 -0.8995125  2   4 -0.8935153 -0.7890313
#> 8   0.4737359 -1.5351550  3   1         NA         NA
#> 9  -0.6849653 -1.0908300  3   2  0.4737359         NA
#> 10 -1.2429211 -0.1138345  4   1         NA         NA
#> 11  0.2457815 -1.9814271  4   2 -1.2429211         NA
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2), NAMES=c("l1","l2"))
#>             y          x id num         l1         l2
#> 1  -0.2982232  0.4953284  1   1         NA         NA
#> 2  -0.2774354  1.0222523  1   2 -0.2982232         NA
#> 3  -3.2679895 -0.4357077  1   3 -0.2774354 -0.2982232
#> 4   2.2863256  2.5695189  2   1         NA         NA
#> 5  -0.7890313  0.6645009  2   2  2.2863256         NA
#> 6  -0.8935153  0.7784325  2   3 -0.7890313  2.2863256
#> 7   2.0713562 -0.8995125  2   4 -0.8935153 -0.7890313
#> 8   0.4737359 -1.5351550  3   1         NA         NA
#> 9  -0.6849653 -1.0908300  3   2  0.4737359         NA
#> 10 -1.2429211 -0.1138345  4   1         NA         NA
#> 11  0.2457815 -1.9814271  4   2 -1.2429211         NA

dby(d, y~id + order(num), mean=mean, csum=cumsum, n=length)
#>             y          x id num       mean       csum n
#> 1  -0.2982232  0.4953284  1   1 -1.2812160 -0.2982232 3
#> 2  -0.2774354  1.0222523  1   2 -1.2812160 -0.5756586 3
#> 3  -3.2679895 -0.4357077  1   3 -1.2812160 -3.8436481 3
#> 4   2.2863256  2.5695189  2   1  0.6687838  2.2863256 4
#> 5  -0.7890313  0.6645009  2   2  0.6687838  1.4972944 4
#> 6  -0.8935153  0.7784325  2   3  0.6687838  0.6037790 4
#> 7   2.0713562 -0.8995125  2   4  0.6687838  2.6751353 4
#> 8   0.4737359 -1.5351550  3   1 -0.1056147  0.4737359 2
#> 9  -0.6849653 -1.0908300  3   2 -0.1056147 -0.2112293 2
#> 10 -1.2429211 -0.1138345  4   1 -0.4985698 -1.2429211 2
#> 11  0.2457815 -1.9814271  4   2 -0.4985698 -0.9971395 2
dby(d2, y~id + order(num), a=cumsum, b=mean, N=length, l1=function(x) c(NA,x)[-length(x)])
#>             y          x id num          a          b N         l1
#> 1  -0.2982232  0.4953284  1   1 -0.2982232 -1.2812160 3         NA
#> 2  -0.2774354  1.0222523  1   2 -0.5756586 -1.2812160 3 -0.2982232
#> 3  -3.2679895 -0.4357077  1   3 -3.8436481 -1.2812160 3 -3.2679895
#> 4   2.2863256  2.5695189  2   1  2.2863256  0.6687838 4         NA
#> 5  -0.7890313  0.6645009  2   2  1.4972944  0.6687838 4  2.2863256
#> 6  -0.8935153  0.7784325  2   3  0.6037790  0.6687838 4 -0.7890313
#> 7   2.0713562 -0.8995125  2   4  2.6751353  0.6687838 4  2.0713562
#> 8   0.4737359 -1.5351550  3   1  0.4737359 -0.1056147 2         NA
#> 9  -0.6849653 -1.0908300  3   2 -0.2112293 -0.1056147 2 -0.6849653
#> 10 -1.2429211 -0.1138345  4   1 -1.2429211 -0.4985698 2         NA
#> 11  0.2457815 -1.9814271  4   2 -0.9971395 -0.4985698 2  0.2457815

dby(d, y~id + order(num), nn=seq_along, n=length)
#>             y          x id num nn n
#> 1  -0.2982232  0.4953284  1   1  1 3
#> 2  -0.2774354  1.0222523  1   2  2 3
#> 3  -3.2679895 -0.4357077  1   3  3 3
#> 4   2.2863256  2.5695189  2   1  1 4
#> 5  -0.7890313  0.6645009  2   2  2 4
#> 6  -0.8935153  0.7784325  2   3  3 4
#> 7   2.0713562 -0.8995125  2   4  4 4
#> 8   0.4737359 -1.5351550  3   1  1 2
#> 9  -0.6849653 -1.0908300  3   2  2 2
#> 10 -1.2429211 -0.1138345  4   1  1 2
#> 11  0.2457815 -1.9814271  4   2  2 2
dby(d, y~id + order(num), nn=seq_along, n=length)
#>             y          x id num nn n
#> 1  -0.2982232  0.4953284  1   1  1 3
#> 2  -0.2774354  1.0222523  1   2  2 3
#> 3  -3.2679895 -0.4357077  1   3  3 3
#> 4   2.2863256  2.5695189  2   1  1 4
#> 5  -0.7890313  0.6645009  2   2  2 4
#> 6  -0.8935153  0.7784325  2   3  3 4
#> 7   2.0713562 -0.8995125  2   4  4 4
#> 8   0.4737359 -1.5351550  3   1  1 2
#> 9  -0.6849653 -1.0908300  3   2  2 2
#> 10 -1.2429211 -0.1138345  4   1  1 2
#> 11  0.2457815 -1.9814271  4   2  2 2

d <- d[,1:4]
dby(d, x<0) <- list(z=mean)
d <- dby(d, is.na(z), z=1)

f <- function(x) apply(x,1,min)
dby(d, y+x~id, min=f)
#> Error: object 'f' not found

dby(d,y+x~id+order(num), function(x) x)
#>             y          x id num         z        _11        _12
#> 1  -0.2982232  0.4953284  1   1 1.0000000 -0.2982232  0.4953284
#> 2  -0.2774354  1.0222523  1   2 1.0000000 -0.2774354  1.0222523
#> 3  -3.2679895 -0.4357077  1   3 0.8974388 -3.2679895 -0.4357077
#> 4   2.2863256  2.5695189  2   1 1.0000000  2.2863256  2.5695189
#> 5  -0.7890313  0.6645009  2   2 1.0000000 -0.7890313  0.6645009
#> 6  -0.8935153  0.7784325  2   3 1.0000000 -0.8935153  0.7784325
#> 7   2.0713562 -0.8995125  2   4 0.8974388  2.0713562 -0.8995125
#> 8   0.4737359 -1.5351550  3   1 0.8974388  0.4737359 -1.5351550
#> 9  -0.6849653 -1.0908300  3   2 0.8974388 -0.6849653 -1.0908300
#> 10 -1.2429211 -0.1138345  4   1 0.8974388 -1.2429211 -0.1138345
#> 11  0.2457815 -1.9814271  4   2 0.8974388  0.2457815 -1.9814271

f <- function(x) { cbind(cumsum(x[,1]),cumsum(x[,2]))/sum(x)}
dby(d, y+x~id, f)
#> Error: object 'f' not found

## column-wise
a <- d
dby2(a, mean, median, REGEX=TRUE) <- '^[y|x]'~id
a
#>             y          x id num         z     mean.y     mean.x   median.y
#> 1  -0.2982232  0.4953284  1   1 1.0000000 -1.2812160  0.3606243 -0.2982232
#> 2  -0.2774354  1.0222523  1   2 1.0000000 -1.2812160  0.3606243 -0.2982232
#> 3  -3.2679895 -0.4357077  1   3 0.8974388 -1.2812160  0.3606243 -0.2982232
#> 4   2.2863256  2.5695189  2   1 1.0000000  0.6687838  0.7782349  0.6411625
#> 5  -0.7890313  0.6645009  2   2 1.0000000  0.6687838  0.7782349  0.6411625
#> 6  -0.8935153  0.7784325  2   3 1.0000000  0.6687838  0.7782349  0.6411625
#> 7   2.0713562 -0.8995125  2   4 0.8974388  0.6687838  0.7782349  0.6411625
#> 8   0.4737359 -1.5351550  3   1 0.8974388 -0.1056147 -1.3129925 -0.1056147
#> 9  -0.6849653 -1.0908300  3   2 0.8974388 -0.1056147 -1.3129925 -0.1056147
#> 10 -1.2429211 -0.1138345  4   1 0.8974388 -0.4985698 -1.0476308 -0.4985698
#> 11  0.2457815 -1.9814271  4   2 0.8974388 -0.4985698 -1.0476308 -0.4985698
#>      median.x
#> 1   0.4953284
#> 2   0.4953284
#> 3   0.4953284
#> 4   0.7214667
#> 5   0.7214667
#> 6   0.7214667
#> 7   0.7214667
#> 8  -1.3129925
#> 9  -1.3129925
#> 10 -1.0476308
#> 11 -1.0476308
## wildcards 
dby2(a,'y*'+'x*'~id,mean) 
#>             y          x id num         z   median.y   median.x     mean.y
#> 1  -0.2982232  0.4953284  1   1 1.0000000 -0.2982232  0.4953284 -1.2812160
#> 2  -0.2774354  1.0222523  1   2 1.0000000 -0.2982232  0.4953284 -1.2812160
#> 3  -3.2679895 -0.4357077  1   3 0.8974388 -0.2982232  0.4953284 -1.2812160
#> 4   2.2863256  2.5695189  2   1 1.0000000  0.6411625  0.7214667  0.6687838
#> 5  -0.7890313  0.6645009  2   2 1.0000000  0.6411625  0.7214667  0.6687838
#> 6  -0.8935153  0.7784325  2   3 1.0000000  0.6411625  0.7214667  0.6687838
#> 7   2.0713562 -0.8995125  2   4 0.8974388  0.6411625  0.7214667  0.6687838
#> 8   0.4737359 -1.5351550  3   1 0.8974388 -0.1056147 -1.3129925 -0.1056147
#> 9  -0.6849653 -1.0908300  3   2 0.8974388 -0.1056147 -1.3129925 -0.1056147
#> 10 -1.2429211 -0.1138345  4   1 0.8974388 -0.4985698 -1.0476308 -0.4985698
#> 11  0.2457815 -1.9814271  4   2 0.8974388 -0.4985698 -1.0476308 -0.4985698
#>        mean.x
#> 1   0.3606243
#> 2   0.3606243
#> 3   0.3606243
#> 4   0.7782349
#> 5   0.7782349
#> 6   0.7782349
#> 7   0.7782349
#> 8  -1.3129925
#> 9  -1.3129925
#> 10 -1.0476308
#> 11 -1.0476308


## subset
dby(d, x<0) <- list(z=NA)
d
#>             y          x id num  z
#> 1  -0.2982232  0.4953284  1   1  1
#> 2  -0.2774354  1.0222523  1   2  1
#> 3  -3.2679895 -0.4357077  1   3 NA
#> 4   2.2863256  2.5695189  2   1  1
#> 5  -0.7890313  0.6645009  2   2  1
#> 6  -0.8935153  0.7784325  2   3  1
#> 7   2.0713562 -0.8995125  2   4 NA
#> 8   0.4737359 -1.5351550  3   1 NA
#> 9  -0.6849653 -1.0908300  3   2 NA
#> 10 -1.2429211 -0.1138345  4   1 NA
#> 11  0.2457815 -1.9814271  4   2 NA
dby(d, y~id|x>-1, v=mean,z=1)
#>             y          x id num          v  z
#> 1  -0.2982232  0.4953284  1   1 -1.2812160  1
#> 2  -0.2774354  1.0222523  1   2 -1.2812160  1
#> 3  -3.2679895 -0.4357077  1   3 -1.2812160  1
#> 4   2.2863256  2.5695189  2   1  0.6687838  1
#> 5  -0.7890313  0.6645009  2   2  0.6687838  1
#> 6  -0.8935153  0.7784325  2   3  0.6687838  1
#> 7   2.0713562 -0.8995125  2   4  0.6687838  1
#> 8   0.4737359 -1.5351550  3   1         NA NA
#> 9  -0.6849653 -1.0908300  3   2         NA NA
#> 10 -1.2429211 -0.1138345  4   1 -1.2429211  1
#> 11  0.2457815 -1.9814271  4   2         NA NA
dby(d, y+x~id|x>-1, mean, median, COLUMN=TRUE)
#>             y          x id num  z     mean.y     mean.x   median.y   median.x
#> 1  -0.2982232  0.4953284  1   1  1 -1.2812160  0.3606243 -0.2982232  0.4953284
#> 2  -0.2774354  1.0222523  1   2  1 -1.2812160  0.3606243 -0.2982232  0.4953284
#> 3  -3.2679895 -0.4357077  1   3 NA -1.2812160  0.3606243 -0.2982232  0.4953284
#> 4   2.2863256  2.5695189  2   1  1  0.6687838  0.7782349  0.6411625  0.7214667
#> 5  -0.7890313  0.6645009  2   2  1  0.6687838  0.7782349  0.6411625  0.7214667
#> 6  -0.8935153  0.7784325  2   3  1  0.6687838  0.7782349  0.6411625  0.7214667
#> 7   2.0713562 -0.8995125  2   4 NA  0.6687838  0.7782349  0.6411625  0.7214667
#> 8   0.4737359 -1.5351550  3   1 NA         NA         NA         NA         NA
#> 9  -0.6849653 -1.0908300  3   2 NA         NA         NA         NA         NA
#> 10 -1.2429211 -0.1138345  4   1 NA -1.2429211 -0.1138345 -1.2429211 -0.1138345
#> 11  0.2457815 -1.9814271  4   2 NA         NA         NA         NA         NA

dby2(d, y+x~id|x>0, mean, REDUCE=TRUE)
#>   id     mean.y    mean.x
#> 1  1 -0.2878293 0.7587903
#> 2  2  0.2012597 1.3374841

dby(d,y~id|x<0,mean,ALL=FALSE)
#>             y          x id num  z       mean
#> 3  -3.2679895 -0.4357077  1   3 NA -3.2679895
#> 7   2.0713562 -0.8995125  2   4 NA  2.0713562
#> 8   0.4737359 -1.5351550  3   1 NA -0.1056147
#> 9  -0.6849653 -1.0908300  3   2 NA -0.1056147
#> 10 -1.2429211 -0.1138345  4   1 NA -0.4985698
#> 11  0.2457815 -1.9814271  4   2 NA -0.4985698

a <- iris
a <- dby(a,y=1)
dby(a,Species=="versicolor") <- list(y=2)