Skip to contents

Calculate summary statistics grouped by variable

Usage

dby(
  data,
  INPUT,
  ...,
  ID = NULL,
  ORDER = NULL,
  SUBSET = NULL,
  SORT = 0,
  COMBINE = !REDUCE,
  NOCHECK = FALSE,
  ARGS = NULL,
  NAMES,
  COLUMN = FALSE,
  REDUCE = FALSE,
  REGEX = mets.options()$regex,
  ALL = TRUE
)

Arguments

data

Data.frame

INPUT

Input variables (character or formula)

...

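Functions to apply to the input variables within each group (optionally named, e.g. mean=mean, in which case the names are used as output column names)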

ID

id variable

ORDER

(optional) order variable

SUBSET

(optional) subset expression

SORT

sort order (id+order variable)

COMBINE

If TRUE, the result is appended to data

NOCHECK

If TRUE, skip sorting and the check for missing data

ARGS

Optional list of arguments to functions (...)

NAMES

Optional vector of column names

COLUMN

If TRUE, do the calculations for each column

REDUCE

Reduce number of redundant rows

REGEX

Allow regular expressions

ALL

If FALSE, only the subset will be returned

Details

Calculate summary statistics grouped by variable.

Use dby2 for column-wise calculations.

Author

Klaus K. Holst and Thomas Scheike

Examples

n <- 4
k <- c(3,rbinom(n-1,3,0.5)+1)
N <- sum(k)
d <- data.frame(y=rnorm(N),x=rnorm(N),
   id=rep(seq(n),k),num=unlist(sapply(k,seq))
)
d2 <- d[sample(nrow(d)),]

dby(d2, y~id, mean)
#>            y          x id num       mean
#> 3  2.6190994  0.7257837  1   3  0.3065174
#> 1 -1.2150233  0.3308853  1   1  0.3065174
#> 2 -0.4845239 -1.1121771  1   2  0.3065174
#> 5 -1.9064389  0.8263667  2   2 -0.3917604
#> 4 -0.3123679 -0.5940380  2   1 -0.3917604
#> 6  1.0435257 -1.2821343  2   3 -0.3917604
#> 7 -0.8944838 -0.7110099  3   1 -0.8944838
#> 9  1.2689582 -0.1237129  4   2  0.3647540
#> 8 -0.5394503 -1.9165302  4   1  0.3647540
dby(d2, y~id + order(num), cumsum)
#>            y          x id num     cumsum
#> 1 -1.2150233  0.3308853  1   1 -1.2150233
#> 2 -0.4845239 -1.1121771  1   2 -1.6995472
#> 3  2.6190994  0.7257837  1   3  0.9195522
#> 4 -0.3123679 -0.5940380  2   1 -0.3123679
#> 5 -1.9064389  0.8263667  2   2 -2.2188069
#> 6  1.0435257 -1.2821343  2   3 -1.1752811
#> 7 -0.8944838 -0.7110099  3   1 -0.8944838
#> 8 -0.5394503 -1.9165302  4   1 -0.5394503
#> 9  1.2689582 -0.1237129  4   2  0.7295079

dby(d,y ~ id + order(num), dlag)
#>            y          x id num       dlag
#> 1 -1.2150233  0.3308853  1   1         NA
#> 2 -0.4845239 -1.1121771  1   2 -1.2150233
#> 3  2.6190994  0.7257837  1   3 -0.4845239
#> 4 -0.3123679 -0.5940380  2   1         NA
#> 5 -1.9064389  0.8263667  2   2 -0.3123679
#> 6  1.0435257 -1.2821343  2   3 -1.9064389
#> 7 -0.8944838 -0.7110099  3   1         NA
#> 8 -0.5394503 -1.9165302  4   1         NA
#> 9  1.2689582 -0.1237129  4   2 -0.5394503
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2))
#>            y          x id num      dlag1      dlag2
#> 1 -1.2150233  0.3308853  1   1         NA         NA
#> 2 -0.4845239 -1.1121771  1   2 -1.2150233         NA
#> 3  2.6190994  0.7257837  1   3 -0.4845239 -1.2150233
#> 4 -0.3123679 -0.5940380  2   1         NA         NA
#> 5 -1.9064389  0.8263667  2   2 -0.3123679         NA
#> 6  1.0435257 -1.2821343  2   3 -1.9064389 -0.3123679
#> 7 -0.8944838 -0.7110099  3   1         NA         NA
#> 8 -0.5394503 -1.9165302  4   1         NA         NA
#> 9  1.2689582 -0.1237129  4   2 -0.5394503         NA
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2), NAMES=c("l1","l2"))
#>            y          x id num         l1         l2
#> 1 -1.2150233  0.3308853  1   1         NA         NA
#> 2 -0.4845239 -1.1121771  1   2 -1.2150233         NA
#> 3  2.6190994  0.7257837  1   3 -0.4845239 -1.2150233
#> 4 -0.3123679 -0.5940380  2   1         NA         NA
#> 5 -1.9064389  0.8263667  2   2 -0.3123679         NA
#> 6  1.0435257 -1.2821343  2   3 -1.9064389 -0.3123679
#> 7 -0.8944838 -0.7110099  3   1         NA         NA
#> 8 -0.5394503 -1.9165302  4   1         NA         NA
#> 9  1.2689582 -0.1237129  4   2 -0.5394503         NA

dby(d, y~id + order(num), mean=mean, csum=cumsum, n=length)
#>            y          x id num       mean       csum n
#> 1 -1.2150233  0.3308853  1   1  0.3065174 -1.2150233 3
#> 2 -0.4845239 -1.1121771  1   2  0.3065174 -1.6995472 3
#> 3  2.6190994  0.7257837  1   3  0.3065174  0.9195522 3
#> 4 -0.3123679 -0.5940380  2   1 -0.3917604 -0.3123679 3
#> 5 -1.9064389  0.8263667  2   2 -0.3917604 -2.2188069 3
#> 6  1.0435257 -1.2821343  2   3 -0.3917604 -1.1752811 3
#> 7 -0.8944838 -0.7110099  3   1 -0.8944838 -0.8944838 1
#> 8 -0.5394503 -1.9165302  4   1  0.3647540 -0.5394503 2
#> 9  1.2689582 -0.1237129  4   2  0.3647540  0.7295079 2
dby(d2, y~id + order(num), a=cumsum, b=mean, N=length,
  l1=function(x) c(NA,x)[-length(x)]
)
#>            y          x id num          a          b N         l1
#> 1 -1.2150233  0.3308853  1   1 -1.2150233  0.3065174 3         NA
#> 2 -0.4845239 -1.1121771  1   2 -1.6995472  0.3065174 3 -1.2150233
#> 3  2.6190994  0.7257837  1   3  0.9195522  0.3065174 3  2.6190994
#> 4 -0.3123679 -0.5940380  2   1 -0.3123679 -0.3917604 3         NA
#> 5 -1.9064389  0.8263667  2   2 -2.2188069 -0.3917604 3 -0.3123679
#> 6  1.0435257 -1.2821343  2   3 -1.1752811 -0.3917604 3  1.0435257
#> 7 -0.8944838 -0.7110099  3   1 -0.8944838 -0.8944838 1 -0.8944838
#> 8 -0.5394503 -1.9165302  4   1 -0.5394503  0.3647540 2         NA
#> 9  1.2689582 -0.1237129  4   2  0.7295079  0.3647540 2  1.2689582

dby(d, y~id + order(num), nn=seq_along, n=length)
#>            y          x id num nn n
#> 1 -1.2150233  0.3308853  1   1  1 3
#> 2 -0.4845239 -1.1121771  1   2  2 3
#> 3  2.6190994  0.7257837  1   3  3 3
#> 4 -0.3123679 -0.5940380  2   1  1 3
#> 5 -1.9064389  0.8263667  2   2  2 3
#> 6  1.0435257 -1.2821343  2   3  3 3
#> 7 -0.8944838 -0.7110099  3   1  1 1
#> 8 -0.5394503 -1.9165302  4   1  1 2
#> 9  1.2689582 -0.1237129  4   2  2 2
dby(d, y~id + order(num), nn=seq_along, n=length)
#>            y          x id num nn n
#> 1 -1.2150233  0.3308853  1   1  1 3
#> 2 -0.4845239 -1.1121771  1   2  2 3
#> 3  2.6190994  0.7257837  1   3  3 3
#> 4 -0.3123679 -0.5940380  2   1  1 3
#> 5 -1.9064389  0.8263667  2   2  2 3
#> 6  1.0435257 -1.2821343  2   3  3 3
#> 7 -0.8944838 -0.7110099  3   1  1 1
#> 8 -0.5394503 -1.9165302  4   1  1 2
#> 9  1.2689582 -0.1237129  4   2  2 2

d <- d[,1:4]
dby(d, x<0) <- list(z=mean)
d <- dby(d, is.na(z), z=1)

f <- function(x) apply(x,1,min)
dby(d, y+x~id, min=f)
#> Error: object 'f' not found

dby(d,y+x~id+order(num), function(x) x)
#>            y          x id num         z        _11        _12
#> 1 -1.2150233  0.3308853  1   1 1.0000000 -1.2150233  0.3308853
#> 2 -0.4845239 -1.1121771  1   2 0.8475856 -0.4845239 -1.1121771
#> 3  2.6190994  0.7257837  1   3 1.0000000  2.6190994  0.7257837
#> 4 -0.3123679 -0.5940380  2   1 0.8475856 -0.3123679 -0.5940380
#> 5 -1.9064389  0.8263667  2   2 1.0000000 -1.9064389  0.8263667
#> 6  1.0435257 -1.2821343  2   3 0.8475856  1.0435257 -1.2821343
#> 7 -0.8944838 -0.7110099  3   1 0.8475856 -0.8944838 -0.7110099
#> 8 -0.5394503 -1.9165302  4   1 0.8475856 -0.5394503 -1.9165302
#> 9  1.2689582 -0.1237129  4   2 0.8475856  1.2689582 -0.1237129

f <- function(x) { cbind(cumsum(x[,1]),cumsum(x[,2]))/sum(x)}
dby(d, y+x~id, f)
#> Error: object 'f' not found

## column-wise
a <- d
dby2(a, mean, median, REGEX=TRUE) <- '^[y|x]'~id
a
#>            y          x id num         z     mean.y      mean.x   median.y
#> 1 -1.2150233  0.3308853  1   1 1.0000000  0.3065174 -0.01850271 -0.4845239
#> 2 -0.4845239 -1.1121771  1   2 0.8475856  0.3065174 -0.01850271 -0.4845239
#> 3  2.6190994  0.7257837  1   3 1.0000000  0.3065174 -0.01850271 -0.4845239
#> 4 -0.3123679 -0.5940380  2   1 0.8475856 -0.3917604 -0.34993519 -0.3123679
#> 5 -1.9064389  0.8263667  2   2 1.0000000 -0.3917604 -0.34993519 -0.3123679
#> 6  1.0435257 -1.2821343  2   3 0.8475856 -0.3917604 -0.34993519 -0.3123679
#> 7 -0.8944838 -0.7110099  3   1 0.8475856 -0.8944838 -0.71100994 -0.8944838
#> 8 -0.5394503 -1.9165302  4   1 0.8475856  0.3647540 -1.02012154  0.3647540
#> 9  1.2689582 -0.1237129  4   2 0.8475856  0.3647540 -1.02012154  0.3647540
#>     median.x
#> 1  0.3308853
#> 2  0.3308853
#> 3  0.3308853
#> 4 -0.5940380
#> 5 -0.5940380
#> 6 -0.5940380
#> 7 -0.7110099
#> 8 -1.0201215
#> 9 -1.0201215
## wildcards 
dby2(a,'y*'+'x*'~id,mean) 
#>            y          x id num         z   median.y   median.x     mean.y
#> 1 -1.2150233  0.3308853  1   1 1.0000000 -0.4845239  0.3308853  0.3065174
#> 2 -0.4845239 -1.1121771  1   2 0.8475856 -0.4845239  0.3308853  0.3065174
#> 3  2.6190994  0.7257837  1   3 1.0000000 -0.4845239  0.3308853  0.3065174
#> 4 -0.3123679 -0.5940380  2   1 0.8475856 -0.3123679 -0.5940380 -0.3917604
#> 5 -1.9064389  0.8263667  2   2 1.0000000 -0.3123679 -0.5940380 -0.3917604
#> 6  1.0435257 -1.2821343  2   3 0.8475856 -0.3123679 -0.5940380 -0.3917604
#> 7 -0.8944838 -0.7110099  3   1 0.8475856 -0.8944838 -0.7110099 -0.8944838
#> 8 -0.5394503 -1.9165302  4   1 0.8475856  0.3647540 -1.0201215  0.3647540
#> 9  1.2689582 -0.1237129  4   2 0.8475856  0.3647540 -1.0201215  0.3647540
#>        mean.x
#> 1 -0.01850271
#> 2 -0.01850271
#> 3 -0.01850271
#> 4 -0.34993519
#> 5 -0.34993519
#> 6 -0.34993519
#> 7 -0.71100994
#> 8 -1.02012154
#> 9 -1.02012154


## subset
dby(d, x<0) <- list(z=NA)
d
#>            y          x id num  z
#> 1 -1.2150233  0.3308853  1   1  1
#> 2 -0.4845239 -1.1121771  1   2 NA
#> 3  2.6190994  0.7257837  1   3  1
#> 4 -0.3123679 -0.5940380  2   1 NA
#> 5 -1.9064389  0.8263667  2   2  1
#> 6  1.0435257 -1.2821343  2   3 NA
#> 7 -0.8944838 -0.7110099  3   1 NA
#> 8 -0.5394503 -1.9165302  4   1 NA
#> 9  1.2689582 -0.1237129  4   2 NA
dby(d, y~id|x>-1, v=mean,z=1)
#>            y          x id num          v  z
#> 1 -1.2150233  0.3308853  1   1  0.7020380  1
#> 2 -0.4845239 -1.1121771  1   2         NA NA
#> 3  2.6190994  0.7257837  1   3  0.7020380  1
#> 4 -0.3123679 -0.5940380  2   1 -1.1094034  1
#> 5 -1.9064389  0.8263667  2   2 -1.1094034  1
#> 6  1.0435257 -1.2821343  2   3         NA NA
#> 7 -0.8944838 -0.7110099  3   1 -0.8944838  1
#> 8 -0.5394503 -1.9165302  4   1         NA NA
#> 9  1.2689582 -0.1237129  4   2  1.2689582  1
dby(d, y+x~id|x>-1, mean, median, COLUMN=TRUE)
#>            y          x id num  z     mean.y     mean.x   median.y   median.x
#> 1 -1.2150233  0.3308853  1   1  1  0.7020380  0.5283345  0.7020380  0.5283345
#> 2 -0.4845239 -1.1121771  1   2 NA         NA         NA         NA         NA
#> 3  2.6190994  0.7257837  1   3  1  0.7020380  0.5283345  0.7020380  0.5283345
#> 4 -0.3123679 -0.5940380  2   1 NA -1.1094034  0.1161644 -1.1094034  0.1161644
#> 5 -1.9064389  0.8263667  2   2  1 -1.1094034  0.1161644 -1.1094034  0.1161644
#> 6  1.0435257 -1.2821343  2   3 NA         NA         NA         NA         NA
#> 7 -0.8944838 -0.7110099  3   1 NA -0.8944838 -0.7110099 -0.8944838 -0.7110099
#> 8 -0.5394503 -1.9165302  4   1 NA         NA         NA         NA         NA
#> 9  1.2689582 -0.1237129  4   2 NA  1.2689582 -0.1237129  1.2689582 -0.1237129

dby2(d, y+x~id|x>0, mean, REDUCE=TRUE)
#>   id    mean.y    mean.x
#> 1  1  0.702038 0.5283345
#> 2  2 -1.906439 0.8263667

dby(d,y~id|x<0,mean,ALL=FALSE)
#>            y          x id num  z       mean
#> 2 -0.4845239 -1.1121771  1   2 NA -0.4845239
#> 4 -0.3123679 -0.5940380  2   1 NA  0.3655789
#> 6  1.0435257 -1.2821343  2   3 NA  0.3655789
#> 7 -0.8944838 -0.7110099  3   1 NA -0.8944838
#> 8 -0.5394503 -1.9165302  4   1 NA  0.3647540
#> 9  1.2689582 -0.1237129  4   2 NA  0.3647540

a <- iris
a <- dby(a,y=1)
dby(a,Species=="versicolor") <- list(y=2)