Calculate summary statistics grouped by variable
Usage
dby(
data,
INPUT,
...,
ID = NULL,
ORDER = NULL,
SUBSET = NULL,
SORT = 0,
COMBINE = !REDUCE,
NOCHECK = FALSE,
ARGS = NULL,
NAMES,
COLUMN = FALSE,
REDUCE = FALSE,
REGEX = mets.options()$regex,
ALL = TRUE
)
Arguments
- data
Data.frame
- INPUT
Input variables (character or formula)
- ...
Functions to apply to the input variables within each group (optionally named, e.g. mean=mean)
- ID
id variable
- ORDER
(optional) order variable
- SUBSET
(optional) subset expression
- SORT
sort order (id+order variable)
- COMBINE
If TRUE result is appended to data
- NOCHECK
If TRUE, skip sorting and the check for missing data
- ARGS
Optional list of arguments to functions (...)
- NAMES
Optional vector of column names
- COLUMN
If TRUE do the calculations for each column
- REDUCE
If TRUE, redundant rows are removed from the result
- REGEX
If TRUE, allow regular expressions when specifying the input variables
- ALL
If FALSE, only the rows in the subset are returned
Examples
n <- 4
k <- c(3,rbinom(n-1,3,0.5)+1)
N <- sum(k)
d <- data.frame(y=rnorm(N),x=rnorm(N),
id=rep(seq(n),k),num=unlist(sapply(k,seq))
)
d2 <- d[sample(nrow(d)),]
dby(d2, y~id, mean)
#> y x id num mean
#> 2 -1.3171416 0.3308853 1 2 -0.8564182
#> 1 0.2246146 1.2689582 1 1 -0.8564182
#> 3 -1.4767275 -1.1121771 1 3 -0.8564182
#> 5 -0.4845239 -0.5940380 2 2 0.3065174
#> 4 -1.2150233 0.7257837 2 1 0.3065174
#> 6 2.6190994 0.8263667 2 3 0.3065174
#> 8 -1.9064389 -0.7110099 3 2 -0.3917604
#> 9 1.0435257 -1.9165302 3 3 -0.3917604
#> 7 -0.3123679 -1.2821343 3 1 -0.3917604
#> 10 -0.8944838 -0.1237129 4 1 -0.7169671
#> 11 -0.5394503 0.5109017 4 2 -0.7169671
dby(d2, y~id + order(num), cumsum)
#> y x id num cumsum
#> 1 0.2246146 1.2689582 1 1 0.2246146
#> 2 -1.3171416 0.3308853 1 2 -1.0925270
#> 3 -1.4767275 -1.1121771 1 3 -2.5692545
#> 4 -1.2150233 0.7257837 2 1 -1.2150233
#> 5 -0.4845239 -0.5940380 2 2 -1.6995472
#> 6 2.6190994 0.8263667 2 3 0.9195522
#> 7 -0.3123679 -1.2821343 3 1 -0.3123679
#> 8 -1.9064389 -0.7110099 3 2 -2.2188069
#> 9 1.0435257 -1.9165302 3 3 -1.1752811
#> 10 -0.8944838 -0.1237129 4 1 -0.8944838
#> 11 -0.5394503 0.5109017 4 2 -1.4339341
dby(d,y ~ id + order(num), dlag)
#> y x id num dlag
#> 1 0.2246146 1.2689582 1 1 NA
#> 2 -1.3171416 0.3308853 1 2 0.2246146
#> 3 -1.4767275 -1.1121771 1 3 -1.3171416
#> 4 -1.2150233 0.7257837 2 1 NA
#> 5 -0.4845239 -0.5940380 2 2 -1.2150233
#> 6 2.6190994 0.8263667 2 3 -0.4845239
#> 7 -0.3123679 -1.2821343 3 1 NA
#> 8 -1.9064389 -0.7110099 3 2 -0.3123679
#> 9 1.0435257 -1.9165302 3 3 -1.9064389
#> 10 -0.8944838 -0.1237129 4 1 NA
#> 11 -0.5394503 0.5109017 4 2 -0.8944838
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2))
#> y x id num dlag1 dlag2
#> 1 0.2246146 1.2689582 1 1 NA NA
#> 2 -1.3171416 0.3308853 1 2 0.2246146 NA
#> 3 -1.4767275 -1.1121771 1 3 -1.3171416 0.2246146
#> 4 -1.2150233 0.7257837 2 1 NA NA
#> 5 -0.4845239 -0.5940380 2 2 -1.2150233 NA
#> 6 2.6190994 0.8263667 2 3 -0.4845239 -1.2150233
#> 7 -0.3123679 -1.2821343 3 1 NA NA
#> 8 -1.9064389 -0.7110099 3 2 -0.3123679 NA
#> 9 1.0435257 -1.9165302 3 3 -1.9064389 -0.3123679
#> 10 -0.8944838 -0.1237129 4 1 NA NA
#> 11 -0.5394503 0.5109017 4 2 -0.8944838 NA
dby(d,y ~ id + order(num), dlag, ARGS=list(k=1:2), NAMES=c("l1","l2"))
#> y x id num l1 l2
#> 1 0.2246146 1.2689582 1 1 NA NA
#> 2 -1.3171416 0.3308853 1 2 0.2246146 NA
#> 3 -1.4767275 -1.1121771 1 3 -1.3171416 0.2246146
#> 4 -1.2150233 0.7257837 2 1 NA NA
#> 5 -0.4845239 -0.5940380 2 2 -1.2150233 NA
#> 6 2.6190994 0.8263667 2 3 -0.4845239 -1.2150233
#> 7 -0.3123679 -1.2821343 3 1 NA NA
#> 8 -1.9064389 -0.7110099 3 2 -0.3123679 NA
#> 9 1.0435257 -1.9165302 3 3 -1.9064389 -0.3123679
#> 10 -0.8944838 -0.1237129 4 1 NA NA
#> 11 -0.5394503 0.5109017 4 2 -0.8944838 NA
dby(d, y~id + order(num), mean=mean, csum=cumsum, n=length)
#> y x id num mean csum n
#> 1 0.2246146 1.2689582 1 1 -0.8564182 0.2246146 3
#> 2 -1.3171416 0.3308853 1 2 -0.8564182 -1.0925270 3
#> 3 -1.4767275 -1.1121771 1 3 -0.8564182 -2.5692545 3
#> 4 -1.2150233 0.7257837 2 1 0.3065174 -1.2150233 3
#> 5 -0.4845239 -0.5940380 2 2 0.3065174 -1.6995472 3
#> 6 2.6190994 0.8263667 2 3 0.3065174 0.9195522 3
#> 7 -0.3123679 -1.2821343 3 1 -0.3917604 -0.3123679 3
#> 8 -1.9064389 -0.7110099 3 2 -0.3917604 -2.2188069 3
#> 9 1.0435257 -1.9165302 3 3 -0.3917604 -1.1752811 3
#> 10 -0.8944838 -0.1237129 4 1 -0.7169671 -0.8944838 2
#> 11 -0.5394503 0.5109017 4 2 -0.7169671 -1.4339341 2
dby(d2, y~id + order(num), a=cumsum, b=mean, N=length,
l1=function(x) c(NA,x)[-length(x)]
)
#> y x id num a b N l1
#> 1 0.2246146 1.2689582 1 1 0.2246146 -0.8564182 3 NA
#> 2 -1.3171416 0.3308853 1 2 -1.0925270 -0.8564182 3 0.2246146
#> 3 -1.4767275 -1.1121771 1 3 -2.5692545 -0.8564182 3 -1.4767275
#> 4 -1.2150233 0.7257837 2 1 -1.2150233 0.3065174 3 NA
#> 5 -0.4845239 -0.5940380 2 2 -1.6995472 0.3065174 3 -1.2150233
#> 6 2.6190994 0.8263667 2 3 0.9195522 0.3065174 3 2.6190994
#> 7 -0.3123679 -1.2821343 3 1 -0.3123679 -0.3917604 3 NA
#> 8 -1.9064389 -0.7110099 3 2 -2.2188069 -0.3917604 3 -0.3123679
#> 9 1.0435257 -1.9165302 3 3 -1.1752811 -0.3917604 3 1.0435257
#> 10 -0.8944838 -0.1237129 4 1 -0.8944838 -0.7169671 2 NA
#> 11 -0.5394503 0.5109017 4 2 -1.4339341 -0.7169671 2 -0.5394503
dby(d, y~id + order(num), nn=seq_along, n=length)
#> y x id num nn n
#> 1 0.2246146 1.2689582 1 1 1 3
#> 2 -1.3171416 0.3308853 1 2 2 3
#> 3 -1.4767275 -1.1121771 1 3 3 3
#> 4 -1.2150233 0.7257837 2 1 1 3
#> 5 -0.4845239 -0.5940380 2 2 2 3
#> 6 2.6190994 0.8263667 2 3 3 3
#> 7 -0.3123679 -1.2821343 3 1 1 3
#> 8 -1.9064389 -0.7110099 3 2 2 3
#> 9 1.0435257 -1.9165302 3 3 3 3
#> 10 -0.8944838 -0.1237129 4 1 1 2
#> 11 -0.5394503 0.5109017 4 2 2 2
dby(d, y~id + order(num), nn=seq_along, n=length)
#> y x id num nn n
#> 1 0.2246146 1.2689582 1 1 1 3
#> 2 -1.3171416 0.3308853 1 2 2 3
#> 3 -1.4767275 -1.1121771 1 3 3 3
#> 4 -1.2150233 0.7257837 2 1 1 3
#> 5 -0.4845239 -0.5940380 2 2 2 3
#> 6 2.6190994 0.8263667 2 3 3 3
#> 7 -0.3123679 -1.2821343 3 1 1 3
#> 8 -1.9064389 -0.7110099 3 2 2 3
#> 9 1.0435257 -1.9165302 3 3 3 3
#> 10 -0.8944838 -0.1237129 4 1 1 2
#> 11 -0.5394503 0.5109017 4 2 2 2
d <- d[,1:4]
dby(d, x<0) <- list(z=mean)
d <- dby(d, is.na(z), z=1)
f <- function(x) apply(x,1,min)
dby(d, y+x~id, min=f)
#> Error: object 'f' not found
dby(d,y+x~id+order(num), function(x) x)
#> y x id num z _11 _12
#> 1 0.2246146 1.2689582 1 1 1.0000000 0.2246146 1.2689582
#> 2 -1.3171416 0.3308853 1 2 1.0000000 -1.3171416 0.3308853
#> 3 -1.4767275 -1.1121771 1 3 0.7595576 -1.4767275 -1.1121771
#> 4 -1.2150233 0.7257837 2 1 1.0000000 -1.2150233 0.7257837
#> 5 -0.4845239 -0.5940380 2 2 0.7595576 -0.4845239 -0.5940380
#> 6 2.6190994 0.8263667 2 3 1.0000000 2.6190994 0.8263667
#> 7 -0.3123679 -1.2821343 3 1 0.7595576 -0.3123679 -1.2821343
#> 8 -1.9064389 -0.7110099 3 2 0.7595576 -1.9064389 -0.7110099
#> 9 1.0435257 -1.9165302 3 3 0.7595576 1.0435257 -1.9165302
#> 10 -0.8944838 -0.1237129 4 1 0.7595576 -0.8944838 -0.1237129
#> 11 -0.5394503 0.5109017 4 2 1.0000000 -0.5394503 0.5109017
f <- function(x) { cbind(cumsum(x[,1]),cumsum(x[,2]))/sum(x)}
dby(d, y+x~id, f)
#> Error: object 'f' not found
## column-wise
a <- d
dby2(a, mean, median, REGEX=TRUE) <- '^[y|x]'~id
a
#> y x id num z mean.y mean.x median.y
#> 1 0.2246146 1.2689582 1 1 1.0000000 -0.8564182 0.1625555 -1.3171416
#> 2 -1.3171416 0.3308853 1 2 1.0000000 -0.8564182 0.1625555 -1.3171416
#> 3 -1.4767275 -1.1121771 1 3 0.7595576 -0.8564182 0.1625555 -1.3171416
#> 4 -1.2150233 0.7257837 2 1 1.0000000 0.3065174 0.3193708 -0.4845239
#> 5 -0.4845239 -0.5940380 2 2 0.7595576 0.3065174 0.3193708 -0.4845239
#> 6 2.6190994 0.8263667 2 3 1.0000000 0.3065174 0.3193708 -0.4845239
#> 7 -0.3123679 -1.2821343 3 1 0.7595576 -0.3917604 -1.3032248 -0.3123679
#> 8 -1.9064389 -0.7110099 3 2 0.7595576 -0.3917604 -1.3032248 -0.3123679
#> 9 1.0435257 -1.9165302 3 3 0.7595576 -0.3917604 -1.3032248 -0.3123679
#> 10 -0.8944838 -0.1237129 4 1 0.7595576 -0.7169671 0.1935944 -0.7169671
#> 11 -0.5394503 0.5109017 4 2 1.0000000 -0.7169671 0.1935944 -0.7169671
#> median.x
#> 1 0.3308853
#> 2 0.3308853
#> 3 0.3308853
#> 4 0.7257837
#> 5 0.7257837
#> 6 0.7257837
#> 7 -1.2821343
#> 8 -1.2821343
#> 9 -1.2821343
#> 10 0.1935944
#> 11 0.1935944
## wildcards
dby2(a,'y*'+'x*'~id,mean)
#> y x id num z median.y median.x mean.y
#> 1 0.2246146 1.2689582 1 1 1.0000000 -1.3171416 0.3308853 -0.8564182
#> 2 -1.3171416 0.3308853 1 2 1.0000000 -1.3171416 0.3308853 -0.8564182
#> 3 -1.4767275 -1.1121771 1 3 0.7595576 -1.3171416 0.3308853 -0.8564182
#> 4 -1.2150233 0.7257837 2 1 1.0000000 -0.4845239 0.7257837 0.3065174
#> 5 -0.4845239 -0.5940380 2 2 0.7595576 -0.4845239 0.7257837 0.3065174
#> 6 2.6190994 0.8263667 2 3 1.0000000 -0.4845239 0.7257837 0.3065174
#> 7 -0.3123679 -1.2821343 3 1 0.7595576 -0.3123679 -1.2821343 -0.3917604
#> 8 -1.9064389 -0.7110099 3 2 0.7595576 -0.3123679 -1.2821343 -0.3917604
#> 9 1.0435257 -1.9165302 3 3 0.7595576 -0.3123679 -1.2821343 -0.3917604
#> 10 -0.8944838 -0.1237129 4 1 0.7595576 -0.7169671 0.1935944 -0.7169671
#> 11 -0.5394503 0.5109017 4 2 1.0000000 -0.7169671 0.1935944 -0.7169671
#> mean.x
#> 1 0.1625555
#> 2 0.1625555
#> 3 0.1625555
#> 4 0.3193708
#> 5 0.3193708
#> 6 0.3193708
#> 7 -1.3032248
#> 8 -1.3032248
#> 9 -1.3032248
#> 10 0.1935944
#> 11 0.1935944
## subset
dby(d, x<0) <- list(z=NA)
d
#> y x id num z
#> 1 0.2246146 1.2689582 1 1 1
#> 2 -1.3171416 0.3308853 1 2 1
#> 3 -1.4767275 -1.1121771 1 3 NA
#> 4 -1.2150233 0.7257837 2 1 1
#> 5 -0.4845239 -0.5940380 2 2 NA
#> 6 2.6190994 0.8263667 2 3 1
#> 7 -0.3123679 -1.2821343 3 1 NA
#> 8 -1.9064389 -0.7110099 3 2 NA
#> 9 1.0435257 -1.9165302 3 3 NA
#> 10 -0.8944838 -0.1237129 4 1 NA
#> 11 -0.5394503 0.5109017 4 2 1
dby(d, y~id|x>-1, v=mean,z=1)
#> y x id num v z
#> 1 0.2246146 1.2689582 1 1 -0.5462635 1
#> 2 -1.3171416 0.3308853 1 2 -0.5462635 1
#> 3 -1.4767275 -1.1121771 1 3 NA NA
#> 4 -1.2150233 0.7257837 2 1 0.3065174 1
#> 5 -0.4845239 -0.5940380 2 2 0.3065174 1
#> 6 2.6190994 0.8263667 2 3 0.3065174 1
#> 7 -0.3123679 -1.2821343 3 1 NA NA
#> 8 -1.9064389 -0.7110099 3 2 -1.9064389 1
#> 9 1.0435257 -1.9165302 3 3 NA NA
#> 10 -0.8944838 -0.1237129 4 1 -0.7169671 1
#> 11 -0.5394503 0.5109017 4 2 -0.7169671 1
dby(d, y+x~id|x>-1, mean, median, COLUMN=TRUE)
#> y x id num z mean.y mean.x median.y median.x
#> 1 0.2246146 1.2689582 1 1 1 -0.5462635 0.7999218 -0.5462635 0.7999218
#> 2 -1.3171416 0.3308853 1 2 1 -0.5462635 0.7999218 -0.5462635 0.7999218
#> 3 -1.4767275 -1.1121771 1 3 NA NA NA NA NA
#> 4 -1.2150233 0.7257837 2 1 1 0.3065174 0.3193708 -0.4845239 0.7257837
#> 5 -0.4845239 -0.5940380 2 2 NA 0.3065174 0.3193708 -0.4845239 0.7257837
#> 6 2.6190994 0.8263667 2 3 1 0.3065174 0.3193708 -0.4845239 0.7257837
#> 7 -0.3123679 -1.2821343 3 1 NA NA NA NA NA
#> 8 -1.9064389 -0.7110099 3 2 NA -1.9064389 -0.7110099 -1.9064389 -0.7110099
#> 9 1.0435257 -1.9165302 3 3 NA NA NA NA NA
#> 10 -0.8944838 -0.1237129 4 1 NA -0.7169671 0.1935944 -0.7169671 0.1935944
#> 11 -0.5394503 0.5109017 4 2 1 -0.7169671 0.1935944 -0.7169671 0.1935944
dby2(d, y+x~id|x>0, mean, REDUCE=TRUE)
#> id mean.y mean.x
#> 1 1 -0.5462635 0.7999218
#> 2 2 0.7020380 0.7760752
#> 3 4 -0.5394503 0.5109017
dby(d,y~id|x<0,mean,ALL=FALSE)
#> y x id num z mean
#> 3 -1.4767275 -1.1121771 1 3 NA -1.4767275
#> 5 -0.4845239 -0.5940380 2 2 NA -0.4845239
#> 7 -0.3123679 -1.2821343 3 1 NA -0.3917604
#> 8 -1.9064389 -0.7110099 3 2 NA -0.3917604
#> 9 1.0435257 -1.9165302 3 3 NA -0.3917604
#> 10 -0.8944838 -0.1237129 4 1 NA -0.8944838
a <- iris
a <- dby(a,y=1)
dby(a,Species=="versicolor") <- list(y=2)
