Skip to contents

Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.

Usage

# S3 method for class 'lvm'
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)

Arguments

x

Model object

n

Number of simulated values/individuals

p

Parameter value (optional)

normal

Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition

cond

for internal use

sigma

Default residual variance (1)

rho

Default covariance parameter (0.5)

X

Optional matrix of fixed values of variables (manipulation)

Return Inverse link transformed data

latent

Include latent variables (default TRUE)

use.labels

convert categorical variables to factors before applying transformation

seed

Random seed

...

Additional arguments to be passed to the low level functions

Author

Klaus K. Holst

Examples

##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#>   y          x z
#> 1 1 -0.3890521 0
#> 2 1  1.5760916 1
#> 3 1  1.5709196 1
#> 4 1  1.4971902 1
#> 5 1  1.3515534 1
#> 6 1  0.6453046 0

e <- estimate(m,d,estimator="glm")
e
#>              Estimate Std. Error  Z-value   P-value
#> Regressions:                                       
#>    y~x        1.06972    0.09349 11.44259    <1e-12
#>    y~z        1.01578    0.16776  6.05499 1.404e-09
#>     x~z       0.96477    0.06532 14.77041    <1e-12
#> Intercepts:                                        
#>    y         -0.13044    0.10078 -1.29431    0.1956
#>    x          0.02210    0.04701  0.47012    0.6383
#> Dispersion:                                        
#>    x          1.06874                              
## Simulate a few observation from estimated model
sim(e,n=5)
#>   y         x z
#> 1 1 0.7578086 1
#> 2 0 0.6592641 1
#> 3 1 1.2263657 1
#> 4 1 0.2300801 0
#> 5 1 0.9931559 1

##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#>    y           x z
#> 1 14  1.01864520 1
#> 2  1  0.09862234 0
#> 3  0 -0.28820655 0
#> 4 19  1.53349369 1
#> 5  0  0.46584650 0
#> 6  6  1.12191083 1
estimate(m,d,estimator="glm")
#>                Estimate Std. Error    Z-value  P-value
#> Regressions:                                          
#>    y~x          1.99880    0.00192 1042.74787   <1e-12
#>    y~z          0.98743    0.01116   88.49805   <1e-12
#>     x~z         0.99262    0.02273   43.66784   <1e-12
#> Intercepts:                                           
#>    y           -0.98504    0.01167  -84.44281   <1e-12
#>    x            0.00358    0.01950    0.18345   0.8544
#> Dispersion:                                           
#>    x            1.00426                               
mean(d$z); lava:::expit(1)
#> [1] 0.7251
#> [1] 0.7310586

summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#> 
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -4.2216 -1.0570  0.0261  0.9492  4.5983 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  0.99738    0.04548   21.93   <2e-16 ***
#> x            3.99724    0.04426   90.31   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.438 on 998 degrees of freedom
#> Multiple R-squared:  0.891,	Adjusted R-squared:  0.8909 
#> F-statistic:  8156 on 1 and 998 DF,  p-value: < 2.2e-16
#> 

##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#> 
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 0.505874   0.005102   99.15   <2e-16 ***
#> x           1.008180   0.016302   61.84   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> (Dispersion parameter for Gamma family taken to be 0.5156528)
#> 
#>     Null deviance: 8397.4  on 9999  degrees of freedom
#> Residual deviance: 5546.3  on 9998  degrees of freedom
#> AIC: 20678
#> 
#> Number of Fisher Scoring iterations: 6
#> 
if (FALSE) MASS::gamma.shape(g) # \dontrun{}

args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE, 
#>     log = FALSE, ...) 
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#>  [1] -1.14993675  0.06110529 -0.78986992  1.68142084  0.13917097 -0.35847861
#>  [7]  0.73652104 -1.62401234 -0.39977265 -1.16007252

##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#>         y
#> y 1.14993
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#>           y
#> y 0.0521512

##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#> 
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.1528 -0.6738 -0.0486  0.7210  3.6658 
#> 
#> Coefficients:
#>                 Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)     0.021540   0.061655   0.349    0.727    
#> x               0.987755   0.047652  20.728   <2e-16 ***
#> z               1.016270   0.054549  18.630   <2e-16 ***
#> I(z > 0)TRUE   -0.005932   0.106185  -0.056    0.955    
#> x:I(z > 0)TRUE  1.009156   0.065653  15.371   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.018 on 995 degrees of freedom
#> Multiple R-squared:  0.7704,	Adjusted R-squared:  0.7694 
#> F-statistic: 834.5 on 4 and 995 DF,  p-value: < 2.2e-16
#> 

##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
                               Binary.lvm(),       ## Vector of ones
                               Binary.lvm(0.5),    ##  0.5n 0, 0.5n 1
                               Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#>            x z v w
#> 1  0.0000000 1 0 0
#> 2  0.5555556 1 0 0
#> 3  1.1111111 1 0 1
#> 4  1.6666667 1 0 1
#> 5  2.2222222 1 0 1
#> 6  2.7777778 1 1 0
#> 7  3.3333333 1 1 0
#> 8  3.8888889 1 1 1
#> 9  4.4444444 1 1 1
#> 10 5.0000000 1 1 1

##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)

if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$t)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }

##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
                             loggamma.lvm(rate=1,shape=1))
if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$y)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#>              y          x            z             t
#> 1  -7.52089416 -0.3975626 0.0521017141 -3.918515e-01
#> 2   2.20571704  0.3470455 0.5086850008  1.122015e+00
#> 3  -4.36804686 -0.3792286 0.5905137765 -2.579392e+00
#> 4  -0.07640396 -1.5337391 0.2058743050 -1.572961e-02
#> 5  21.43594787 -0.2640809 0.0099792760  2.139152e-01
#> 6   4.25372148  0.1839688 0.4443781625  1.890261e+00
#> 7  -0.09900900 -1.4246557 0.0006665787 -6.599729e-05
#> 8  -1.15906345 -1.0335661 0.8045995309 -9.325819e-01
#> 9   6.72402679 -0.2305547 0.6546673730  4.402001e+00
#> 10  5.53307450 -0.1962768 0.2215395973  1.225795e+00
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) { # \dontrun{
mets::fast.reshape(sim(m,100),varying="t")
} # }

##################################################
### General multivariate distributions
##################################################
if (FALSE) { # \dontrun{
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)

m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
} # }

m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#>          y1        y2        y3
#> y1 2.546658 1.2446886 1.1131815
#> y2 1.244689 2.1601392 0.8173183
#> y3 1.113182 0.8173183 1.8257718

## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#>           y          x           z
#> 1 1000.5214  0.2177780  1.93296840
#> 2  999.1417 -0.4748423  0.10528753
#> 3  999.1108 -2.2866782 -0.09999915
#> 4  998.8764 -0.7593130 -0.13847376
#> 5 1001.0689 -0.3586061  0.24695597
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#>            y          x          z
#> 1 -0.5899834  0.5347800 -2.3598938
#> 2 -0.4243369 -0.9962041 -0.5107305
#> 3  3.5456795  1.7645527  1.5297775
#> 4 -1.4880155 -0.7726338 -0.1372156
#> 5 -0.2474045  0.3482477 -0.1443716
sim(m,5,p=c(m1=100))
#>           y            x          z
#> 1 102.25740  0.129915309  2.7875521
#> 2 101.53418  1.820998862 -0.3153138
#> 3  98.71054 -0.007577379 -0.2353492
#> 4  99.17310 -0.901698372 -0.4610830
#> 5 103.29305  0.072049531  1.9003038

##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)

if (interactive()) {
    d <- sim(m,1e3)
    require(survival)
    (cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
    plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}

##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"

regression(m,additive=FALSE) <- y~v
if (FALSE) { # \dontrun{
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
} # }

m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#> 
#>    A    B    C 
#> 5004 3018 1978 
glm(y~v, data=sim(m,1e4))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>     0.02492      1.99774     -1.00229  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    22580 
#> Residual Deviance: 9997 	AIC: 28380
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>    0.003437     2.987178    -1.009686  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    34490 
#> Residual Deviance: 10330 	AIC: 28710

transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#>    v          y          z    v2
#> 1  B  2.2298156  4.1679282 FALSE
#> 2  B  0.6586923  3.7329303 FALSE
#> 3  B  2.6336803  2.2362200 FALSE
#> 4  C -1.2815957 -3.2806097 FALSE
#> 5  B  1.7904833  1.7922099 FALSE
#> 6  A  1.5817981 -2.4106877  TRUE
#> 7  A  0.9435654  0.8197093  TRUE
#> 8  A  0.5989207  0.9182792  TRUE
#> 9  C -0.5337450  0.2826166 FALSE
#> 10 A  0.7108434 -1.3749699  TRUE

##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#>             y          x
#> 1  -2.9969686 -0.5822500
#> 2   2.0932122  0.4675629
#> 3   2.4677225  1.1228049
#> 4   1.3229557  1.3806512
#> 5   3.1524416  1.9010546
#> 6  -0.6734257 -0.6149359
#> 7   0.4034541 -0.4402311
#> 8   0.5278260 -0.2312745
#> 9   1.3184855  0.7196587
#> 10 -0.6764807 -0.2205306
sim(m2,10) ## Faster
#>             y          x
#> 1   0.2093413 -0.1414400
#> 2   1.7423583  0.9679083
#> 3   2.3735490  0.8827087
#> 4  -2.9847372 -1.3114255
#> 5   0.8275647 -0.2983199
#> 6  -3.4592201 -1.8890341
#> 7   2.4341144  0.9988011
#> 8  -1.1049947 -0.4407522
#> 9  -0.4269766 -0.1266534
#> 10 -1.4520110 -0.6076527