Simulate model — sim • lava

Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.

# S3 method for class 'lvm'
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)

Arguments

x: Model object
...: Additional arguments to be passed to the low level functions
n: Number of simulated values/individuals
p: Parameter value (optional)
normal: Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition
cond: for internal use
sigma: Default residual variance (1)
rho: Default covariance parameter (0.5)
X: Optional matrix of fixed values of variables (manipulation)
unlink: Return Inverse link transformed data
latent: Include latent variables (default TRUE)
use.labels: convert categorical variables to factors before applying transformation
seed: Random seed

Author

Klaus K. Holst

Examples

##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#>   y          x z
#> 1 0 -1.3821237 0
#> 2 1  0.1592860 0
#> 3 0 -0.2024402 0
#> 4 1 -0.5407706 0
#> 5 1  3.0592772 1
#> 6 0 -1.0871117 0

e <- estimate(m,d,estimator="glm")
e
#>              Estimate Std. Error  Z-value   P-value
#> Regressions:                                       
#>    y~x        1.16436    0.09460 12.30808    <1e-12
#>    y~z        0.92734    0.17162  5.40331 6.542e-08
#>     x~z       1.17538    0.06522 18.02299    <1e-12
#> Intercepts:                                        
#>    y         -0.10564    0.10227 -1.03300    0.3016
#>    x         -0.08681    0.04530 -1.91634   0.05532
#> Dispersion:                                        
#>    x          1.06572                              
## Simulate a few observation from estimated model
sim(e,n=5)
#>   y           x z
#> 1 0  0.09892573 1
#> 2 1  1.79234819 1
#> 3 1  1.39730452 1
#> 4 1 -0.34995267 0
#> 5 1  1.29523903 1

##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#>    y          x z
#> 1 16  1.4257214 1
#> 2  0 -1.4682438 0
#> 3  1  0.1739810 1
#> 4  1  0.3735584 1
#> 5  4  0.1141325 1
#> 6  0 -1.1831437 0
estimate(m,d,estimator="glm")
#>                Estimate Std. Error    Z-value  P-value
#> Regressions:                                          
#>    y~x          1.99862    0.00167 1194.23410   <1e-12
#>    y~z          1.00920    0.01135   88.93651   <1e-12
#>     x~z         1.05711    0.02256   46.86270   <1e-12
#> Intercepts:                                           
#>    y           -1.00566    0.01169  -86.01586   <1e-12
#>    x           -0.04412    0.01927   -2.28899  0.02208
#> Dispersion:                                           
#>    x            1.00309                               
mean(d$z); lava:::expit(1)
#> [1] 0.7254
#> [1] 0.7310586

summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#> 
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.2750 -0.9708  0.0370  0.9322  5.3931 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  0.99595    0.04454   22.36   <2e-16 ***
#> x            4.00101    0.04458   89.74   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.407 on 998 degrees of freedom
#> Multiple R-squared:  0.8897,	Adjusted R-squared:  0.8896 
#> F-statistic:  8054 on 1 and 998 DF,  p-value: < 2.2e-16
#> 

##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#> 
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 0.504608   0.005085   99.23   <2e-16 ***
#> x           1.014488   0.016342   62.08   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> (Dispersion parameter for Gamma family taken to be 0.5150804)
#> 
#>     Null deviance: 8427.8  on 9999  degrees of freedom
#> Residual deviance: 5548.4  on 9998  degrees of freedom
#> AIC: 20679
#> 
#> Number of Fisher Scoring iterations: 6
#> 
if (FALSE) MASS::gamma.shape(g) # \dontrun{}

args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE, 
#>     log = FALSE, ...) 
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#>  [1]  2.1503587  0.5181377  0.5199346 -0.3664227  0.8646908  1.1661599
#>  [7]  0.0231124  1.7150221 -1.4569616 -0.0718344

##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#>           y
#> y 0.9935516
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#>            y
#> y 0.04904392

##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#> 
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.6065 -0.6391  0.0365  0.6927  3.0687 
#> 
#> Coefficients:
#>                 Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)    -0.017082   0.063323  -0.270    0.787    
#> x               1.026723   0.045408  22.611   <2e-16 ***
#> z               1.003452   0.054980  18.251   <2e-16 ***
#> I(z > 0)TRUE    0.002987   0.109813   0.027    0.978    
#> x:I(z > 0)TRUE  0.939556   0.063042  14.904   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.017 on 995 degrees of freedom
#> Multiple R-squared:  0.7794,	Adjusted R-squared:  0.7785 
#> F-statistic: 878.7 on 4 and 995 DF,  p-value: < 2.2e-16
#> 

##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
                               Binary.lvm(),       ## Vector of ones
                               Binary.lvm(0.5),    ##  0.5n 0, 0.5n 1
                               Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#>            x z v w
#> 1  0.0000000 1 0 0
#> 2  0.5555556 1 0 0
#> 3  1.1111111 1 0 1
#> 4  1.6666667 1 0 1
#> 5  2.2222222 1 0 1
#> 6  2.7777778 1 1 0
#> 7  3.3333333 1 1 0
#> 8  3.8888889 1 1 1
#> 9  4.4444444 1 1 1
#> 10 5.0000000 1 1 1

##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)

if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$t)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }

##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
                             loggamma.lvm(rate=1,shape=1))
if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$y)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#>              y           x          z            t
#> 1    0.3504609  3.07734178 0.80047627   0.28053563
#> 2  -64.6491771 -0.31848175 0.39379954 -25.45881626
#> 3    6.0457242 -0.08502363 0.63887608   3.86246860
#> 4    1.9675872  0.81133022 0.70311623   1.38344251
#> 5   -1.1417809 -0.53137860 0.90058416  -1.02826980
#> 6    0.4674614  1.00179887 0.82935702   0.38769237
#> 7    0.6922404  0.79769099 0.08787368   0.06082971
#> 8    0.9707728  0.32823279 1.53201476   1.48723828
#> 9   -0.7374005 -0.49151840 0.50868500  -0.37510457
#> 10   1.8513379  1.02202747 0.59051378   1.09324054
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) { # \dontrun{
mets::fast.reshape(sim(m,100),varying="t")
} # }

##################################################
### General multivariate distributions
##################################################
if (FALSE) { # \dontrun{
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)

m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
} # }

m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#>           y1        y2        y3
#> y1 1.6549112 0.6228623 0.5262932
#> y2 0.6228623 1.5185498 0.7571519
#> y3 0.5262932 0.7571519 1.6762197

## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#>           y          x          z
#> 1 1001.3455  0.5818875 -0.3013739
#> 2 1002.1450 -0.1350401  1.2848582
#> 3 1000.0822 -1.0407206  1.5686740
#> 4  994.5267 -2.3053292 -1.8824933
#> 5  998.4568 -1.3360425 -0.3062621
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#>            y          x          z
#> 1 -0.5396791  0.3417136 -1.6480163
#> 2 -3.6095991 -1.6256374 -0.9574870
#> 3  2.1967654 -0.4768969  2.2250715
#> 4 -0.3940713 -0.6209352 -0.6273853
#> 5 -0.7275340  0.5040422  0.3378076
sim(m,5,p=c(m1=100))
#>           y          x          z
#> 1 102.18755  0.1787579  0.5007453
#> 2  98.04506  0.8271937 -1.8126868
#> 3  96.84215 -0.7196575 -1.4649504
#> 4 101.46870  1.6884779 -0.5837411
#> 5  95.68423 -1.1575305 -1.1663874

##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)

if (interactive()) {
    d <- sim(m,1e3)
    require(survival)
    (cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
    plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}

##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"

regression(m,additive=FALSE) <- y~v
if (FALSE) { # \dontrun{
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
} # }

m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#> 
#>    A    B    C 
#> 5000 3051 1949 
glm(y~v, data=sim(m,1e4))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>   -0.003759     2.032846    -0.983280  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    22410 
#> Residual Deviance: 9817 	AIC: 28200
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>   -0.001579     3.033921    -0.988032  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    34150 
#> Residual Deviance: 9792 	AIC: 28180

transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#>    v          y          z    v2
#> 1  B  2.7225014  4.3471577 FALSE
#> 2  B  3.4227940  2.6654917 FALSE
#> 3  A -0.7085905  2.1200748  TRUE
#> 4  C -1.4228910 -2.3743813 FALSE
#> 5  B  3.7990113  2.3934600 FALSE
#> 6  A  0.7750815 -0.3692173  TRUE
#> 7  C  0.1102358 -0.8214093 FALSE
#> 8  B  1.5153310  6.1654183 FALSE
#> 9  B  2.8497096  5.6700463 FALSE
#> 10 B  2.0529501  4.3634553 FALSE

##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#>             y          x
#> 1  -3.1798033 -1.4756611
#> 2   0.7071417  1.0002274
#> 3   2.5909554  1.4401825
#> 4   4.2044588  2.1870327
#> 5  -1.2835247 -0.3542078
#> 6  -0.6652916  0.1662725
#> 7   4.3367737  0.5560635
#> 8  -2.3096870 -1.3264723
#> 9   1.0442205  0.1227436
#> 10 -2.4180767 -0.5474568
sim(m2,10) ## Faster
#>             y           x
#> 1   0.5160272 -0.32499194
#> 2   1.9639069  1.44766411
#> 3   0.8528626 -0.22711638
#> 4   1.3121923  1.50961388
#> 5   1.0176849  0.14595064
#> 6  -3.8403860 -1.48995665
#> 7   0.9179312 -0.13233502
#> 8   0.8011242  0.01558654
#> 9   2.1012753  1.01420451
#> 10 -1.4526002 -1.29120883