Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.

# S3 method for lvm
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)

Arguments

x

Model object

...

Additional arguments to be passed to the low level functions

n

Number of simulated values/individuals

p

Parameter value (optional)

normal

Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition

cond

for internal use

sigma

Default residual variance (1)

rho

Default covariance parameter (0.5)

X

Optional matrix of fixed values of variables (manipulation)

unlink

Return Inverse link transformed data

latent

Include latent variables (default TRUE)

use.labels

convert categorical variables to factors before applying transformation

seed

Random seed

Author

Klaus K. Holst

Examples

##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#>   y          x z
#> 1 0 -1.3821237 0
#> 2 1  0.1592860 0
#> 3 0 -0.2024402 0
#> 4 1 -0.5407706 0
#> 5 1  3.0592772 1
#> 6 0 -1.0871117 0

e <- estimate(m,d,estimator="glm")
e
#>              Estimate Std. Error  Z-value   P-value
#> Regressions:                                       
#>    y~x        1.16436    0.09460 12.30808    <1e-12
#>    y~z        0.92734    0.17162  5.40331 6.542e-08
#>     x~z       1.17538    0.06522 18.02299    <1e-12
#> Intercepts:                                        
#>    y         -0.10564    0.10227 -1.03300    0.3016
#>    x         -0.08681    0.04530 -1.91634   0.05532
#> Dispersion:                                        
#>    x          1.06572                              
## Simulate a few observation from estimated model
sim(e,n=5)
#>   y           x z
#> 1 0  0.09892573 1
#> 2 1  1.79234819 1
#> 3 1  1.39730452 1
#> 4 1 -0.34995267 0
#> 5 1  1.29523903 1

##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#>    y          x z
#> 1 16  1.4257214 1
#> 2  0 -1.4682438 0
#> 3  1  0.1739810 1
#> 4  1  0.3735584 1
#> 5  4  0.1141325 1
#> 6  0 -1.1831437 0
estimate(m,d,estimator="glm")
#>                Estimate Std. Error    Z-value  P-value
#> Regressions:                                          
#>    y~x          1.99862    0.00167 1194.23410   <1e-12
#>    y~z          1.00920    0.01135   88.93651   <1e-12
#>     x~z         1.05711    0.02256   46.86270   <1e-12
#> Intercepts:                                           
#>    y           -1.00566    0.01169  -86.01586   <1e-12
#>    x           -0.04412    0.01927   -2.28899  0.02208
#> Dispersion:                                           
#>    x            1.00309                               
mean(d$z); lava:::expit(1)
#> [1] 0.7254
#> [1] 0.7310586

summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#> 
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.2750 -0.9708  0.0370  0.9322  5.3931 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  0.99595    0.04454   22.36   <2e-16 ***
#> x            4.00101    0.04458   89.74   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.407 on 998 degrees of freedom
#> Multiple R-squared:  0.8897,	Adjusted R-squared:  0.8896 
#> F-statistic:  8054 on 1 and 998 DF,  p-value: < 2.2e-16
#> 

##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#> 
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 0.504608   0.005085   99.23   <2e-16 ***
#> x           1.014488   0.016342   62.08   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> (Dispersion parameter for Gamma family taken to be 0.5150804)
#> 
#>     Null deviance: 8427.8  on 9999  degrees of freedom
#> Residual deviance: 5548.4  on 9998  degrees of freedom
#> AIC: 20679
#> 
#> Number of Fisher Scoring iterations: 6
#> 
if (FALSE) MASS::gamma.shape(g)

args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE, 
#>     log = FALSE, ...) 
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#>  [1]  2.1503587  0.5181377  0.5199346 -0.3664227  0.8646908  1.1661599
#>  [7]  0.0231124  1.7150221 -1.4569616 -0.0718344

##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#>           y
#> y 0.9935516
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#>            y
#> y 0.04904392

##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#> 
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.6065 -0.6391  0.0365  0.6927  3.0687 
#> 
#> Coefficients:
#>                 Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)    -0.017082   0.063323  -0.270    0.787    
#> x               1.026723   0.045408  22.611   <2e-16 ***
#> z               1.003452   0.054980  18.251   <2e-16 ***
#> I(z > 0)TRUE    0.002987   0.109813   0.027    0.978    
#> x:I(z > 0)TRUE  0.939556   0.063042  14.904   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.017 on 995 degrees of freedom
#> Multiple R-squared:  0.7794,	Adjusted R-squared:  0.7785 
#> F-statistic: 878.7 on 4 and 995 DF,  p-value: < 2.2e-16
#> 

##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
                               Binary.lvm(),       ## Vector of ones
                               Binary.lvm(0.5),    ##  0.5n 0, 0.5n 1
                               Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#>            x z v w
#> 1  0.0000000 1 0 0
#> 2  0.5555556 1 0 0
#> 3  1.1111111 1 0 1
#> 4  1.6666667 1 0 1
#> 5  2.2222222 1 0 1
#> 6  2.7777778 1 1 0
#> 7  3.3333333 1 1 0
#> 8  3.8888889 1 1 1
#> 9  4.4444444 1 1 1
#> 10 5.0000000 1 1 1

##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)

if (FALSE) {
    d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$t)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
}

##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
                             loggamma.lvm(rate=1,shape=1))
if (FALSE) {
    d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$y)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
}
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#>              y           x          z            t
#> 1    0.3504609  3.07734178 0.80047627   0.28053563
#> 2  -64.6491771 -0.31848175 0.39379954 -25.45881626
#> 3    6.0457242 -0.08502363 0.63887608   3.86246860
#> 4    1.9675872  0.81133022 0.70311623   1.38344251
#> 5   -1.1417809 -0.53137860 0.90058416  -1.02826980
#> 6    0.4674614  1.00179887 0.82935702   0.38769237
#> 7    0.6922404  0.79769099 0.08787368   0.06082971
#> 8    0.9707728  0.32823279 1.53201476   1.48723828
#> 9   -0.7374005 -0.49151840 0.50868500  -0.37510457
#> 10   1.8513379  1.02202747 0.59051378   1.09324054
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) {
mets::fast.reshape(sim(m,100),varying="t")
}

##################################################
### General multivariate distributions
##################################################
if (FALSE) {
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)

m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
}

m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#>           y1        y2        y3
#> y1 1.6549112 0.6228623 0.5262932
#> y2 0.6228623 1.5185498 0.7571519
#> y3 0.5262932 0.7571519 1.6762197

## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#>           y          x          z
#> 1 1001.3455  0.5818875 -0.3013739
#> 2 1002.1450 -0.1350401  1.2848582
#> 3 1000.0822 -1.0407206  1.5686740
#> 4  994.5267 -2.3053292 -1.8824933
#> 5  998.4568 -1.3360425 -0.3062621
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#>            y          x          z
#> 1 -0.5396791  0.3417136 -1.6480163
#> 2 -3.6095991 -1.6256374 -0.9574870
#> 3  2.1967654 -0.4768969  2.2250715
#> 4 -0.3940713 -0.6209352 -0.6273853
#> 5 -0.7275340  0.5040422  0.3378076
sim(m,5,p=c(m1=100))
#>           y          x          z
#> 1 102.18755  0.1787579  0.5007453
#> 2  98.04506  0.8271937 -1.8126868
#> 3  96.84215 -0.7196575 -1.4649504
#> 4 101.46870  1.6884779 -0.5837411
#> 5  95.68423 -1.1575305 -1.1663874

##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)

if (interactive()) {
    d <- sim(m,1e3)
    require(survival)
    (cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
    plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}

##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"

regression(m,additive=FALSE) <- y~v
if (FALSE) {
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
}

m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#> 
#>    A    B    C 
#> 5000 3051 1949 
glm(y~v, data=sim(m,1e4))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>   -0.003759     2.032846    -0.983280  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    22410 
#> Residual Deviance: 9817 	AIC: 28200
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>   -0.001579     3.033921    -0.988032  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    34150 
#> Residual Deviance: 9792 	AIC: 28180

transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#>    v          y          z    v2
#> 1  B  2.7225014  4.3471577 FALSE
#> 2  B  3.4227940  2.6654917 FALSE
#> 3  A -0.7085905  2.1200748  TRUE
#> 4  C -1.4228910 -2.3743813 FALSE
#> 5  B  3.7990113  2.3934600 FALSE
#> 6  A  0.7750815 -0.3692173  TRUE
#> 7  C  0.1102358 -0.8214093 FALSE
#> 8  B  1.5153310  6.1654183 FALSE
#> 9  B  2.8497096  5.6700463 FALSE
#> 10 B  2.0529501  4.3634553 FALSE

##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#>             y          x
#> 1  -3.1798033 -1.4756611
#> 2   0.7071417  1.0002274
#> 3   2.5909554  1.4401825
#> 4   4.2044588  2.1870327
#> 5  -1.2835247 -0.3542078
#> 6  -0.6652916  0.1662725
#> 7   4.3367737  0.5560635
#> 8  -2.3096870 -1.3264723
#> 9   1.0442205  0.1227436
#> 10 -2.4180767 -0.5474568
sim(m2,10) ## Faster
#>             y           x
#> 1   0.5160272 -0.32499194
#> 2   1.9639069  1.44766411
#> 3   0.8528626 -0.22711638
#> 4   1.3121923  1.50961388
#> 5   1.0176849  0.14595064
#> 6  -3.8403860 -1.48995665
#> 7   0.9179312 -0.13233502
#> 8   0.8011242  0.01558654
#> 9   2.1012753  1.01420451
#> 10 -1.4526002 -1.29120883