Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.

# S3 method for lvm
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)

## Arguments

x

Model object

...

Additional arguments to be passed to the low level functions

n

Number of simulated values/individuals

p

Parameter value (optional)

normal

Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition

cond

for internal use

sigma

Default residual variance (1)

rho

Default covariance parameter (0.5)

X

Optional matrix of fixed values of variables (manipulation)

latent

Include latent variables (default TRUE)

use.labels

convert categorical variables to factors before applying transformation

seed

Random seed

Klaus K. Holst

## Examples

##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
#>   y          x z
#> 1 0 -1.3821237 0
#> 2 1  0.1592860 0
#> 3 0 -0.2024402 0
#> 4 1 -0.5407706 0
#> 5 1  3.0592772 1
#> 6 0 -1.0871117 0

e <- estimate(m,d,estimator="glm")
e
#>              Estimate Std. Error  Z-value   P-value
#> Regressions:
#>    y~x        1.16436    0.09460 12.30808    <1e-12
#>    y~z        0.92734    0.17162  5.40331 6.542e-08
#>     x~z       1.17538    0.06522 18.02299    <1e-12
#> Intercepts:
#>    y         -0.10564    0.10227 -1.03300    0.3016
#>    x         -0.08681    0.04530 -1.91634   0.05532
#> Dispersion:
#>    x          1.06572
## Simulate a few observation from estimated model
sim(e,n=5)
#>   y           x z
#> 1 0  0.09892573 1
#> 2 1  1.79234819 1
#> 3 1  1.39730452 1
#> 4 1 -0.34995267 0
#> 5 1  1.29523903 1

##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
#>    y          x z
#> 1 16  1.4257214 1
#> 2  0 -1.4682438 0
#> 3  1  0.1739810 1
#> 4  1  0.3735584 1
#> 5  4  0.1141325 1
#> 6  0 -1.1831437 0
estimate(m,d,estimator="glm")
#>                Estimate Std. Error    Z-value  P-value
#> Regressions:
#>    y~x          1.99862    0.00167 1194.23410   <1e-12
#>    y~z          1.00920    0.01135   88.93651   <1e-12
#>     x~z         1.05711    0.02256   46.86270   <1e-12
#> Intercepts:
#>    y           -1.00566    0.01169  -86.01586   <1e-12
#>    x           -0.04412    0.01927   -2.28899  0.02208
#> Dispersion:
#>    x            1.00309
mean(d$z); lava:::expit(1) #> [1] 0.7254 #> [1] 0.7310586 summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3))) #> #> Call: #> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000)) #> #> Residuals: #> Min 1Q Median 3Q Max #> -5.2750 -0.9708 0.0370 0.9322 5.3931 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) #> (Intercept) 0.99595 0.04454 22.36 <2e-16 *** #> x 4.00101 0.04458 89.74 <2e-16 *** #> --- #> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #> #> Residual standard error: 1.407 on 998 degrees of freedom #> Multiple R-squared: 0.8897, Adjusted R-squared: 0.8896 #> F-statistic: 8054 on 1 and 998 DF, p-value: < 2.2e-16 #> ################################################## ### Gamma distribution ################################################## m <- lvm(y~x) distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm()) intercept(m,~y) <- 0.5 d <- sim(m,1e4) summary(g <- glm(y~x,family=Gamma(),data=d)) #> #> Call: #> glm(formula = y ~ x, family = Gamma(), data = d) #> #> Deviance Residuals: #> Min 1Q Median 3Q Max #> -2.8389 -0.6569 -0.1657 0.3067 2.7257 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) #> (Intercept) 0.504608 0.005085 99.23 <2e-16 *** #> x 1.014488 0.016342 62.08 <2e-16 *** #> --- #> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #> #> (Dispersion parameter for Gamma family taken to be 0.5150804) #> #> Null deviance: 8427.8 on 9999 degrees of freedom #> Residual deviance: 5548.4 on 9998 degrees of freedom #> AIC: 20679 #> #> Number of Fisher Scoring iterations: 6 #> if (FALSE) MASS::gamma.shape(g) args(lava::Gamma.lvm) #> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE, #> log = FALSE, ...) #> NULL distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE) sim(m,10,p=c(y=0.5))[,"y"] #> [1] 2.1503587 0.5181377 0.5199346 -0.3664227 0.8646908 1.1661599 #> [7] 0.0231124 1.7150221 -1.4569616 -0.0718344 ################################################## ### Beta ################################################## m <- lvm() distribution(m,~y) <- beta.lvm(alpha=2,beta=1) var(sim(m,100,"y,y"=2)) #> y #> y 0.9935516 distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE) var(sim(m,100)) #> y #> y 0.04904392 ################################################## ### Transform ################################################## m <- lvm() transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0) regression(m) <- y~x+z+xz d <- sim(m,1e3) summary(lm(y~x+z + x*I(z>0),d)) #> #> Call: #> lm(formula = y ~ x + z + x * I(z > 0), data = d) #> #> Residuals: #> Min 1Q Median 3Q Max #> -3.6065 -0.6391 0.0365 0.6927 3.0687 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) #> (Intercept) -0.017082 0.063323 -0.270 0.787 #> x 1.026723 0.045408 22.611 <2e-16 *** #> z 1.003452 0.054980 18.251 <2e-16 *** #> I(z > 0)TRUE 0.002987 0.109813 0.027 0.978 #> x:I(z > 0)TRUE 0.939556 0.063042 14.904 <2e-16 *** #> --- #> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #> #> Residual standard error: 1.017 on 995 degrees of freedom #> Multiple R-squared: 0.7794, Adjusted R-squared: 0.7785 #> F-statistic: 878.7 on 4 and 995 DF, p-value: < 2.2e-16 #> ################################################## ### Non-random variables ################################################## m <- lvm() distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n Binary.lvm(), ## Vector of ones Binary.lvm(0.5), ## 0.5n 0, 0.5n 1 Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1)))) sim(m,10) #> x z v w #> 1 0.0000000 1 0 0 #> 2 0.5555556 1 0 0 #> 3 1.1111111 1 0 1 #> 4 1.6666667 1 0 1 #> 5 2.2222222 1 0 1 #> 6 2.7777778 1 1 0 #> 7 3.3333333 1 1 0 #> 8 3.8888889 1 1 1 #> 9 4.4444444 1 1 1 #> 10 5.0000000 1 1 1 ################################################## ### Cox model ### piecewise constant hazard ################################################ m <- lvm(t~x) rates <- c(1,0.5); cuts <- c(0,5) ## Constant rate: 1 in [0,5), 0.5 in [5,Inf) distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts) if (FALSE) { d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$t)),f=1, cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
method="linear")
}

##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
loggamma.lvm(rate=1,shape=1))
if (FALSE) {
d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE plot(timereg::aalen(survival::Surv(y,status)~x,data=d, resample.iid=0,robust=0),spec=1) L <- approxfun(c(cuts,max(d$y)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$y))))), method="linear") curve(L,0,100,add=TRUE,col="blue") } ## Equivalent via transform (here with Aalens additive hazard model) m <- lvm(y~x) distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts) distribution(m,~z) <- Gamma.lvm(rate=1,shape=1) transform(m,t~y+z) <- prod sim(m,10) #> y x z t #> 1 0.3504609 3.07734178 0.80047627 0.28053563 #> 2 -64.6491771 -0.31848175 0.39379954 -25.45881626 #> 3 6.0457242 -0.08502363 0.63887608 3.86246860 #> 4 1.9675872 0.81133022 0.70311623 1.38344251 #> 5 -1.1417809 -0.53137860 0.90058416 -1.02826980 #> 6 0.4674614 1.00179887 0.82935702 0.38769237 #> 7 0.6922404 0.79769099 0.08787368 0.06082971 #> 8 0.9707728 0.32823279 1.53201476 1.48723828 #> 9 -0.7374005 -0.49151840 0.50868500 -0.37510457 #> 10 1.8513379 1.02202747 0.59051378 1.09324054 ## Shared frailty m <- lvm(c(t1,t2)~x+z) rates <- c(1,0.5); cuts <- c(0,5) distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts) distribution(m,~z) <- loggamma.lvm(rate=1,shape=1) if (FALSE) { mets::fast.reshape(sim(m,100),varying="t") } ################################################## ### General multivariate distributions ################################################## if (FALSE) { m <- lvm() distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25) m <- lvm() distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop sim(m,10,p=c(or1=0.1,or2=4)) } m <- lvm() distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1) var(sim(m,100)) #> y1 y2 y3 #> y1 1.6549112 0.6228623 0.5262932 #> y2 0.6228623 1.5185498 0.7571519 #> y3 0.5262932 0.7571519 1.6762197 ## Syntax also useful for univariate generators, e.g. m <- lvm(y~x+z) distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000) sim(m,5) #> y x z #> 1 1001.3455 0.5818875 -0.3013739 #> 2 1002.1450 -0.1350401 1.2848582 #> 3 1000.0822 -1.0407206 1.5686740 #> 4 994.5267 -2.3053292 -1.8824933 #> 5 998.4568 -1.3360425 -0.3062621 distribution(m,~y,"m1",0) <- rnorm sim(m,5) #> y x z #> 1 -0.5396791 0.3417136 -1.6480163 #> 2 -3.6095991 -1.6256374 -0.9574870 #> 3 2.1967654 -0.4768969 2.2250715 #> 4 -0.3940713 -0.6209352 -0.6273853 #> 5 -0.7275340 0.5040422 0.3378076 sim(m,5,p=c(m1=100)) #> y x z #> 1 102.18755 0.1787579 0.5007453 #> 2 98.04506 0.8271937 -1.8126868 #> 3 96.84215 -0.7196575 -1.4649504 #> 4 101.46870 1.6884779 -0.5837411 #> 5 95.68423 -1.1575305 -1.1663874 ################################################## ### Regression design in other parameters ################################################## ## Variance heterogeneity m <- lvm(y~x) distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5) if (interactive()) plot(y~x,sim(m,1e3)) ## Alternaively, calculate the standard error directly addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame constrain(m,sd~x) <- function(x) exp(x)^.5 distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd) if (interactive()) plot(y~x,sim(m,1e3)) ## Regression on variance parameter m <- lvm() regression(m) <- y~x regression(m) <- v~x ##distribution(m,~v) <- 0 # No stochastic term ## Alternative: ## regression(m) <- v[NA:0]~x distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5) if (interactive()) plot(y~x,sim(m,1e3)) ## Regression on shape parameter in Weibull model m <- lvm() regression(m) <- y ~ z+v regression(m) <- s ~ exp(0.6*x-0.5*z) distribution(m,~x+z) <- binomial.lvm() distribution(m,~cens) <- coxWeibull.lvm(scale=1) distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s) eventTime(m) <- time ~ min(y=1,cens=0) if (interactive()) { d <- sim(m,1e3) require(survival) (cc <- coxph(Surv(time,status)~v+strata(x,z),data=d)) plot(survfit(cc) ,col=1:4,mark.time=FALSE) } ################################################## ### Categorical predictor ################################################## m <- lvm() ## categorical(m,K=3) <- "v" categorical(m,labels=c("A","B","C")) <- "v" regression(m,additive=FALSE) <- y~v if (FALSE) { plot(y~v,sim(m,1000,p=c("y~v:2"=3))) } m <- lvm() categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v" regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v ## equivalent to: ## regression(m,y~v,additive=FALSE) <- c(0,2,-1) regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v table(sim(m,1e4)$v)
#>
#>    A    B    C
#> 5000 3051 1949
glm(y~v, data=sim(m,1e4))
#>
#> Call:  glm(formula = y ~ v, data = sim(m, 10000))
#>
#> Coefficients:
#> (Intercept)           vB           vC
#>   -0.003759     2.032846    -0.983280
#>
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    22410
#> Residual Deviance: 9817 	AIC: 28200
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#>
#> Call:  glm(formula = y ~ v, data = sim(m, 10000, p = c(y~v:1 = 3)))
#>
#> Coefficients:
#> (Intercept)           vB           vC
#>   -0.001579     3.033921    -0.988032
#>
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    34150
#> Residual Deviance: 9792 	AIC: 28180

transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#>    v          y          z    v2
#> 1  B  2.7225014  4.3471577 FALSE
#> 2  B  3.4227940  2.6654917 FALSE
#> 3  A -0.7085905  2.1200748  TRUE
#> 4  C -1.4228910 -2.3743813 FALSE
#> 5  B  3.7990113  2.3934600 FALSE
#> 6  A  0.7750815 -0.3692173  TRUE
#> 7  C  0.1102358 -0.8214093 FALSE
#> 8  B  1.5153310  6.1654183 FALSE
#> 9  B  2.8497096  5.6700463 FALSE
#> 10 B  2.0529501  4.3634553 FALSE

##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#>             y          x
#> 1  -3.1798033 -1.4756611
#> 2   0.7071417  1.0002274
#> 3   2.5909554  1.4401825
#> 4   4.2044588  2.1870327
#> 5  -1.2835247 -0.3542078
#> 6  -0.6652916  0.1662725
#> 7   4.3367737  0.5560635
#> 8  -2.3096870 -1.3264723
#> 9   1.0442205  0.1227436
#> 10 -2.4180767 -0.5474568
sim(m2,10) ## Faster
#>             y           x
#> 1   0.5160272 -0.32499194
#> 2   1.9639069  1.44766411
#> 3   0.8528626 -0.22711638
#> 4   1.3121923  1.50961388
#> 5   1.0176849  0.14595064
#> 6  -3.8403860 -1.48995665
#> 7   0.9179312 -0.13233502
#> 8   0.8011242  0.01558654
#> 9   2.1012753  1.01420451
#> 10 -1.4526002 -1.29120883