Calculate summary statistics grouped by variable

dby(
  data,
  INPUT,
  ...,
  ID = NULL,
  ORDER = NULL,
  SUBSET = NULL,
  SORT = 0,
  COMBINE = !REDUCE,
  NOCHECK = FALSE,
  ARGS = NULL,
  NAMES,
  COLUMN = FALSE,
  REDUCE = FALSE,
  REGEX = mets.options()$regex,
  ALL = TRUE
)

Arguments

data

Data.frame

INPUT

Input variables (character or formula)

...

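Functions to apply to the input variables within each group. If an argument is named, the name is used for the resulting column (e.g. `csum=cumsum`).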

ID

id variable

ORDER

(optional) order variable

SUBSET

(optional) subset expression

SORT

sort order (id+order variable)

COMBINE

If TRUE result is appended to data

NOCHECK

If TRUE, no sorting or check for missing data is performed

ARGS

Optional list of arguments to functions (...)

NAMES

Optional vector of column names

COLUMN

If TRUE do the calculations for each column

REDUCE

Reduce number of redundant rows

REGEX

Allow regular expressions

ALL

If FALSE, only the rows in the subset (see SUBSET) are returned

Details

Calculate summary statistics grouped by variable.

Use dby2 for column-wise calculations.

Author

Klaus K. Holst and Thomas Scheike

Examples

n <- 4
k <- c(3,rbinom(n-1,3,0.5)+1)
N <- sum(k)
d <- data.frame(y=rnorm(N),x=rnorm(N),
   id=rep(seq(n),k),num=unlist(sapply(k,seq))
)
d2 <- d[sample(nrow(d)),]

dby(d2, y~id, mean)
#>              y          x id num       mean
#> 1  -0.65666950  0.3064051  1   1  0.5684420
#> 3   2.24915454 -1.1170657  1   3  0.5684420
#> 2   0.11284083  0.0211384  1   2  0.5684420
#> 6   0.58410126 -0.1862686  2   3 -0.2496999
#> 5  -0.35545945 -0.7846569  2   2 -0.2496999
#> 4  -0.97774138 -1.5214842  2   1 -0.2496999
#> 7   0.05404795 -0.3898105  3   1 -0.5529647
#> 8  -1.15997739 -1.1005508  3   2 -0.5529647
#> 10 -1.12149743 -0.6417276  4   2 -0.6258272
#> 11 -0.18221248  0.9872356  4   3 -0.6258272
#> 9  -0.57377160  0.5216237  4   1 -0.6258272
dby(d2, y~id + order(num), cumsum)
#>              y          x id num      cumsum
#> 1  -0.65666950  0.3064051  1   1 -0.65666950
#> 2   0.11284083  0.0211384  1   2 -0.54382867
#> 3   2.24915454 -1.1170657  1   3  1.70532587
#> 4  -0.97774138 -1.5214842  2   1 -0.97774138
#> 5  -0.35545945 -0.7846569  2   2 -1.33320083
#> 6   0.58410126 -0.1862686  2   3 -0.74909958
#> 7   0.05404795 -0.3898105  3   1  0.05404795
#> 8  -1.15997739 -1.1005508  3   2 -1.10592944
#> 9  -0.57377160  0.5216237  4   1 -0.57377160
#> 10 -1.12149743 -0.6417276  4   2 -1.69526903
#> 11 -0.18221248  0.9872356  4   3 -1.87748151

dby(d,y ~ id + order(num), dlag)
#>              y          x id num        dlag
#> 1  -0.65666950  0.3064051  1   1          NA
#> 2   0.11284083  0.0211384  1   2 -0.65666950
#> 3   2.24915454 -1.1170657  1   3  0.11284083
#> 4  -0.97774138 -1.5214842  2   1          NA
#> 5  -0.35545945 -0.7846569  2   2 -0.97774138
#> 6   0.58410126 -0.1862686  2   3 -0.35545945
#> 7   0.05404795 -0.3898105  3   1          NA
#> 8  -1.15997739 -1.1005508  3   2  0.05404795
#> 9  -0.57377160  0.5216237  4   1          NA
#> 10 -1.12149743 -0.6417276  4   2 -0.57377160
#> 11 -0.18221248  0.9872356  4   3 -1.12149743
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2))
#>              y          x id num       dlag1      dlag2
#> 1  -0.65666950  0.3064051  1   1          NA         NA
#> 2   0.11284083  0.0211384  1   2 -0.65666950         NA
#> 3   2.24915454 -1.1170657  1   3  0.11284083 -0.6566695
#> 4  -0.97774138 -1.5214842  2   1          NA         NA
#> 5  -0.35545945 -0.7846569  2   2 -0.97774138         NA
#> 6   0.58410126 -0.1862686  2   3 -0.35545945 -0.9777414
#> 7   0.05404795 -0.3898105  3   1          NA         NA
#> 8  -1.15997739 -1.1005508  3   2  0.05404795         NA
#> 9  -0.57377160  0.5216237  4   1          NA         NA
#> 10 -1.12149743 -0.6417276  4   2 -0.57377160         NA
#> 11 -0.18221248  0.9872356  4   3 -1.12149743 -0.5737716
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2), NAMES=c("l1","l2"))
#>              y          x id num          l1         l2
#> 1  -0.65666950  0.3064051  1   1          NA         NA
#> 2   0.11284083  0.0211384  1   2 -0.65666950         NA
#> 3   2.24915454 -1.1170657  1   3  0.11284083 -0.6566695
#> 4  -0.97774138 -1.5214842  2   1          NA         NA
#> 5  -0.35545945 -0.7846569  2   2 -0.97774138         NA
#> 6   0.58410126 -0.1862686  2   3 -0.35545945 -0.9777414
#> 7   0.05404795 -0.3898105  3   1          NA         NA
#> 8  -1.15997739 -1.1005508  3   2  0.05404795         NA
#> 9  -0.57377160  0.5216237  4   1          NA         NA
#> 10 -1.12149743 -0.6417276  4   2 -0.57377160         NA
#> 11 -0.18221248  0.9872356  4   3 -1.12149743 -0.5737716

dby(d, y~id + order(num), mean=mean, csum=cumsum, n=length)
#>              y          x id num       mean        csum n
#> 1  -0.65666950  0.3064051  1   1  0.5684420 -0.65666950 3
#> 2   0.11284083  0.0211384  1   2  0.5684420 -0.54382867 3
#> 3   2.24915454 -1.1170657  1   3  0.5684420  1.70532587 3
#> 4  -0.97774138 -1.5214842  2   1 -0.2496999 -0.97774138 3
#> 5  -0.35545945 -0.7846569  2   2 -0.2496999 -1.33320083 3
#> 6   0.58410126 -0.1862686  2   3 -0.2496999 -0.74909958 3
#> 7   0.05404795 -0.3898105  3   1 -0.5529647  0.05404795 2
#> 8  -1.15997739 -1.1005508  3   2 -0.5529647 -1.10592944 2
#> 9  -0.57377160  0.5216237  4   1 -0.6258272 -0.57377160 3
#> 10 -1.12149743 -0.6417276  4   2 -0.6258272 -1.69526903 3
#> 11 -0.18221248  0.9872356  4   3 -0.6258272 -1.87748151 3
dby(d2, y~id + order(num), a=cumsum, b=mean, N=length,
  l1=function(x) c(NA,x)[-length(x)]
)
#>              y          x id num           a          b N         l1
#> 1  -0.65666950  0.3064051  1   1 -0.65666950  0.5684420 3         NA
#> 2   0.11284083  0.0211384  1   2 -0.54382867  0.5684420 3 -0.6566695
#> 3   2.24915454 -1.1170657  1   3  1.70532587  0.5684420 3  2.2491545
#> 4  -0.97774138 -1.5214842  2   1 -0.97774138 -0.2496999 3         NA
#> 5  -0.35545945 -0.7846569  2   2 -1.33320083 -0.2496999 3 -0.9777414
#> 6   0.58410126 -0.1862686  2   3 -0.74909958 -0.2496999 3  0.5841013
#> 7   0.05404795 -0.3898105  3   1  0.05404795 -0.5529647 2         NA
#> 8  -1.15997739 -1.1005508  3   2 -1.10592944 -0.5529647 2 -1.1599774
#> 9  -0.57377160  0.5216237  4   1 -0.57377160 -0.6258272 3         NA
#> 10 -1.12149743 -0.6417276  4   2 -1.69526903 -0.6258272 3 -0.5737716
#> 11 -0.18221248  0.9872356  4   3 -1.87748151 -0.6258272 3 -0.1822125

dby(d, y~id + order(num), nn=seq_along, n=length)
#>              y          x id num nn n
#> 1  -0.65666950  0.3064051  1   1  1 3
#> 2   0.11284083  0.0211384  1   2  2 3
#> 3   2.24915454 -1.1170657  1   3  3 3
#> 4  -0.97774138 -1.5214842  2   1  1 3
#> 5  -0.35545945 -0.7846569  2   2  2 3
#> 6   0.58410126 -0.1862686  2   3  3 3
#> 7   0.05404795 -0.3898105  3   1  1 2
#> 8  -1.15997739 -1.1005508  3   2  2 2
#> 9  -0.57377160  0.5216237  4   1  1 3
#> 10 -1.12149743 -0.6417276  4   2  2 3
#> 11 -0.18221248  0.9872356  4   3  3 3
dby(d, y~id + order(num), nn=seq_along, n=length)
#>              y          x id num nn n
#> 1  -0.65666950  0.3064051  1   1  1 3
#> 2   0.11284083  0.0211384  1   2  2 3
#> 3   2.24915454 -1.1170657  1   3  3 3
#> 4  -0.97774138 -1.5214842  2   1  1 3
#> 5  -0.35545945 -0.7846569  2   2  2 3
#> 6   0.58410126 -0.1862686  2   3  3 3
#> 7   0.05404795 -0.3898105  3   1  1 2
#> 8  -1.15997739 -1.1005508  3   2  2 2
#> 9  -0.57377160  0.5216237  4   1  1 3
#> 10 -1.12149743 -0.6417276  4   2  2 3
#> 11 -0.18221248  0.9872356  4   3  3 3

d <- d[,1:4]
dby(d, x<0) <- list(z=mean)
d <- dby(d, is.na(z), z=1)

f <- function(x) apply(x,1,min)
dby(d, y+x~id, min=f)
#> Error: object 'f' not found

dby(d,y+x~id+order(num), function(x) x)
#>              y          x id num         z         _11        _12
#> 1  -0.65666950  0.3064051  1   1 1.0000000 -0.65666950  0.3064051
#> 2   0.11284083  0.0211384  1   2 1.0000000  0.11284083  0.0211384
#> 3   2.24915454 -1.1170657  1   3 0.8761094  2.24915454 -1.1170657
#> 4  -0.97774138 -1.5214842  2   1 0.8761094 -0.97774138 -1.5214842
#> 5  -0.35545945 -0.7846569  2   2 0.8761094 -0.35545945 -0.7846569
#> 6   0.58410126 -0.1862686  2   3 0.8761094  0.58410126 -0.1862686
#> 7   0.05404795 -0.3898105  3   1 0.8761094  0.05404795 -0.3898105
#> 8  -1.15997739 -1.1005508  3   2 0.8761094 -1.15997739 -1.1005508
#> 9  -0.57377160  0.5216237  4   1 1.0000000 -0.57377160  0.5216237
#> 10 -1.12149743 -0.6417276  4   2 0.8761094 -1.12149743 -0.6417276
#> 11 -0.18221248  0.9872356  4   3 1.0000000 -0.18221248  0.9872356

f <- function(x) { cbind(cumsum(x[,1]),cumsum(x[,2]))/sum(x)}
dby(d, y+x~id, f)
#> Error: object 'f' not found

## column-wise
a <- d
dby2(a, mean, median, REGEX=TRUE) <- '^[y|x]'~id
a
#>              y          x id num         z     mean.y     mean.x   median.y
#> 1  -0.65666950  0.3064051  1   1 1.0000000  0.5684420 -0.2631740  0.1128408
#> 2   0.11284083  0.0211384  1   2 1.0000000  0.5684420 -0.2631740  0.1128408
#> 3   2.24915454 -1.1170657  1   3 0.8761094  0.5684420 -0.2631740  0.1128408
#> 4  -0.97774138 -1.5214842  2   1 0.8761094 -0.2496999 -0.8308032 -0.3554594
#> 5  -0.35545945 -0.7846569  2   2 0.8761094 -0.2496999 -0.8308032 -0.3554594
#> 6   0.58410126 -0.1862686  2   3 0.8761094 -0.2496999 -0.8308032 -0.3554594
#> 7   0.05404795 -0.3898105  3   1 0.8761094 -0.5529647 -0.7451807 -0.5529647
#> 8  -1.15997739 -1.1005508  3   2 0.8761094 -0.5529647 -0.7451807 -0.5529647
#> 9  -0.57377160  0.5216237  4   1 1.0000000 -0.6258272  0.2890439 -0.5737716
#> 10 -1.12149743 -0.6417276  4   2 0.8761094 -0.6258272  0.2890439 -0.5737716
#> 11 -0.18221248  0.9872356  4   3 1.0000000 -0.6258272  0.2890439 -0.5737716
#>      median.x
#> 1   0.0211384
#> 2   0.0211384
#> 3   0.0211384
#> 4  -0.7846569
#> 5  -0.7846569
#> 6  -0.7846569
#> 7  -0.7451807
#> 8  -0.7451807
#> 9   0.5216237
#> 10  0.5216237
#> 11  0.5216237
## wildcards 
dby2(a,'y*'+'x*'~id,mean) 
#>              y          x id num         z   median.y   median.x     mean.y
#> 1  -0.65666950  0.3064051  1   1 1.0000000  0.1128408  0.0211384  0.5684420
#> 2   0.11284083  0.0211384  1   2 1.0000000  0.1128408  0.0211384  0.5684420
#> 3   2.24915454 -1.1170657  1   3 0.8761094  0.1128408  0.0211384  0.5684420
#> 4  -0.97774138 -1.5214842  2   1 0.8761094 -0.3554594 -0.7846569 -0.2496999
#> 5  -0.35545945 -0.7846569  2   2 0.8761094 -0.3554594 -0.7846569 -0.2496999
#> 6   0.58410126 -0.1862686  2   3 0.8761094 -0.3554594 -0.7846569 -0.2496999
#> 7   0.05404795 -0.3898105  3   1 0.8761094 -0.5529647 -0.7451807 -0.5529647
#> 8  -1.15997739 -1.1005508  3   2 0.8761094 -0.5529647 -0.7451807 -0.5529647
#> 9  -0.57377160  0.5216237  4   1 1.0000000 -0.5737716  0.5216237 -0.6258272
#> 10 -1.12149743 -0.6417276  4   2 0.8761094 -0.5737716  0.5216237 -0.6258272
#> 11 -0.18221248  0.9872356  4   3 1.0000000 -0.5737716  0.5216237 -0.6258272
#>        mean.x
#> 1  -0.2631740
#> 2  -0.2631740
#> 3  -0.2631740
#> 4  -0.8308032
#> 5  -0.8308032
#> 6  -0.8308032
#> 7  -0.7451807
#> 8  -0.7451807
#> 9   0.2890439
#> 10  0.2890439
#> 11  0.2890439


## subset
dby(d, x<0) <- list(z=NA)
d
#>              y          x id num  z
#> 1  -0.65666950  0.3064051  1   1  1
#> 2   0.11284083  0.0211384  1   2  1
#> 3   2.24915454 -1.1170657  1   3 NA
#> 4  -0.97774138 -1.5214842  2   1 NA
#> 5  -0.35545945 -0.7846569  2   2 NA
#> 6   0.58410126 -0.1862686  2   3 NA
#> 7   0.05404795 -0.3898105  3   1 NA
#> 8  -1.15997739 -1.1005508  3   2 NA
#> 9  -0.57377160  0.5216237  4   1  1
#> 10 -1.12149743 -0.6417276  4   2 NA
#> 11 -0.18221248  0.9872356  4   3  1
dby(d, y~id|x>-1, v=mean,z=1)
#>              y          x id num           v  z
#> 1  -0.65666950  0.3064051  1   1 -0.27191433  1
#> 2   0.11284083  0.0211384  1   2 -0.27191433  1
#> 3   2.24915454 -1.1170657  1   3          NA NA
#> 4  -0.97774138 -1.5214842  2   1          NA NA
#> 5  -0.35545945 -0.7846569  2   2  0.11432090  1
#> 6   0.58410126 -0.1862686  2   3  0.11432090  1
#> 7   0.05404795 -0.3898105  3   1  0.05404795  1
#> 8  -1.15997739 -1.1005508  3   2          NA NA
#> 9  -0.57377160  0.5216237  4   1 -0.62582717  1
#> 10 -1.12149743 -0.6417276  4   2 -0.62582717  1
#> 11 -0.18221248  0.9872356  4   3 -0.62582717  1
dby(d, y+x~id|x>-1, mean, median, COLUMN=TRUE)
#>              y          x id num  z      mean.y     mean.x    median.y
#> 1  -0.65666950  0.3064051  1   1  1 -0.27191433  0.1637718 -0.27191433
#> 2   0.11284083  0.0211384  1   2  1 -0.27191433  0.1637718 -0.27191433
#> 3   2.24915454 -1.1170657  1   3 NA          NA         NA          NA
#> 4  -0.97774138 -1.5214842  2   1 NA          NA         NA          NA
#> 5  -0.35545945 -0.7846569  2   2 NA  0.11432090 -0.4854627  0.11432090
#> 6   0.58410126 -0.1862686  2   3 NA  0.11432090 -0.4854627  0.11432090
#> 7   0.05404795 -0.3898105  3   1 NA  0.05404795 -0.3898105  0.05404795
#> 8  -1.15997739 -1.1005508  3   2 NA          NA         NA          NA
#> 9  -0.57377160  0.5216237  4   1  1 -0.62582717  0.2890439 -0.57377160
#> 10 -1.12149743 -0.6417276  4   2 NA -0.62582717  0.2890439 -0.57377160
#> 11 -0.18221248  0.9872356  4   3  1 -0.62582717  0.2890439 -0.57377160
#>      median.x
#> 1   0.1637718
#> 2   0.1637718
#> 3          NA
#> 4          NA
#> 5  -0.4854627
#> 6  -0.4854627
#> 7  -0.3898105
#> 8          NA
#> 9   0.5216237
#> 10  0.5216237
#> 11  0.5216237

dby2(d, y+x~id|x>0, mean, REDUCE=TRUE)
#>   id     mean.y    mean.x
#> 1  1 -0.2719143 0.1637718
#> 2  4 -0.3779920 0.7544297

dby(d,y~id|x<0,mean,ALL=FALSE)
#>              y          x id num  z       mean
#> 3   2.24915454 -1.1170657  1   3 NA  2.2491545
#> 4  -0.97774138 -1.5214842  2   1 NA -0.2496999
#> 5  -0.35545945 -0.7846569  2   2 NA -0.2496999
#> 6   0.58410126 -0.1862686  2   3 NA -0.2496999
#> 7   0.05404795 -0.3898105  3   1 NA -0.5529647
#> 8  -1.15997739 -1.1005508  3   2 NA -0.5529647
#> 10 -1.12149743 -0.6417276  4   2 NA -1.1214974

a <- iris
a <- dby(a,y=1)
dby(a,Species=="versicolor") <- list(y=2)