Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.

# S3 method for lvm
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)

Arguments

x

Model object

...

Additional arguments to be passed to the low level functions

n

Number of simulated values/individuals

p

Parameter value (optional)

normal

Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition

cond

for internal use

sigma

Default residual variance (1)

rho

Default covariance parameter (0.5)

X

Optional matrix of fixed values of variables (manipulation)

unlink

Return Inverse link transformed data

latent

Include latent variables (default TRUE)

use.labels

convert categorical variables to factors before applying transformation

seed

Random seed

Author

Klaus K. Holst

Examples

################################################## ## Logistic regression ################################################## m <- lvm(y~x+z) regression(m) <- x~z distribution(m,~y+z) <- binomial.lvm("logit") d <- sim(m,1e3) head(d)
#> y x z #> 1 0 -1.3821237 0 #> 2 1 0.1592860 0 #> 3 0 -0.2024402 0 #> 4 1 -0.5407706 0 #> 5 1 3.0592772 1 #> 6 0 -1.0871117 0
e <- estimate(m,d,estimator="glm") e
#> Estimate Std. Error Z-value P-value #> Regressions: #> y~x 1.16436 0.09460 12.30808 <1e-12 #> y~z 0.92734 0.17162 5.40331 6.542e-08 #> x~z 1.17538 0.06522 18.02299 <1e-12 #> Intercepts: #> y -0.10564 0.10227 -1.03300 0.3016 #> x -0.08681 0.04530 -1.91634 0.05532 #> Dispersion: #> x 1.06572
## Simulate a few observation from estimated model sim(e,n=5)
#> y x z #> 1 0 0.09892573 1 #> 2 1 1.79234819 1 #> 3 1 1.39730452 1 #> 4 1 -0.34995267 0 #> 5 1 1.29523903 1
################################################## ## Poisson ################################################## distribution(m,~y) <- poisson.lvm() d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1)) head(d)
#> y x z #> 1 16 1.4257214 1 #> 2 0 -1.4682438 0 #> 3 1 0.1739810 1 #> 4 1 0.3735584 1 #> 5 4 0.1141325 1 #> 6 0 -1.1831437 0
estimate(m,d,estimator="glm")
#> Estimate Std. Error Z-value P-value #> Regressions: #> y~x 1.99862 0.00167 1194.23410 <1e-12 #> y~z 1.00920 0.01135 88.93651 <1e-12 #> x~z 1.05711 0.02256 46.86270 <1e-12 #> Intercepts: #> y -1.00566 0.01169 -86.01586 <1e-12 #> x -0.04412 0.01927 -2.28899 0.02208 #> Dispersion: #> x 1.00309
mean(d$z); lava:::expit(1)
#> [1] 0.7254
#> [1] 0.7310586
summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#> #> Call: #> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000)) #> #> Residuals: #> Min 1Q Median 3Q Max #> -5.2750 -0.9708 0.0370 0.9322 5.3931 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) #> (Intercept) 0.99595 0.04454 22.36 <2e-16 *** #> x 4.00101 0.04458 89.74 <2e-16 *** #> --- #> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #> #> Residual standard error: 1.407 on 998 degrees of freedom #> Multiple R-squared: 0.8897, Adjusted R-squared: 0.8896 #> F-statistic: 8054 on 1 and 998 DF, p-value: < 2.2e-16 #>
################################################## ### Gamma distribution ################################################## m <- lvm(y~x) distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm()) intercept(m,~y) <- 0.5 d <- sim(m,1e4) summary(g <- glm(y~x,family=Gamma(),data=d))
#> #> Call: #> glm(formula = y ~ x, family = Gamma(), data = d) #> #> Deviance Residuals: #> Min 1Q Median 3Q Max #> -2.8389 -0.6569 -0.1657 0.3067 2.7257 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) #> (Intercept) 0.504608 0.005085 99.23 <2e-16 *** #> x 1.014488 0.016342 62.08 <2e-16 *** #> --- #> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #> #> (Dispersion parameter for Gamma family taken to be 0.5150804) #> #> Null deviance: 8427.8 on 9999 degrees of freedom #> Residual deviance: 5548.4 on 9998 degrees of freedom #> AIC: 20679 #> #> Number of Fisher Scoring iterations: 6 #>
if (FALSE) MASS::gamma.shape(g) args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE, #> log = FALSE, ...) #> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE) sim(m,10,p=c(y=0.5))[,"y"]
#> [1] 2.1503587 0.5181377 0.5199346 -0.3664227 0.8646908 1.1661599 #> [7] 0.0231124 1.7150221 -1.4569616 -0.0718344
################################################## ### Beta ################################################## m <- lvm() distribution(m,~y) <- beta.lvm(alpha=2,beta=1) var(sim(m,100,"y,y"=2))
#> y #> y 0.9935516
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE) var(sim(m,100))
#> y #> y 0.04904392
################################################## ### Transform ################################################## m <- lvm() transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0) regression(m) <- y~x+z+xz d <- sim(m,1e3) summary(lm(y~x+z + x*I(z>0),d))
#> #> Call: #> lm(formula = y ~ x + z + x * I(z > 0), data = d) #> #> Residuals: #> Min 1Q Median 3Q Max #> -3.6065 -0.6391 0.0365 0.6927 3.0687 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) #> (Intercept) -0.017082 0.063323 -0.270 0.787 #> x 1.026723 0.045408 22.611 <2e-16 *** #> z 1.003452 0.054980 18.251 <2e-16 *** #> I(z > 0)TRUE 0.002987 0.109813 0.027 0.978 #> x:I(z > 0)TRUE 0.939556 0.063042 14.904 <2e-16 *** #> --- #> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #> #> Residual standard error: 1.017 on 995 degrees of freedom #> Multiple R-squared: 0.7794, Adjusted R-squared: 0.7785 #> F-statistic: 878.7 on 4 and 995 DF, p-value: < 2.2e-16 #>
################################################## ### Non-random variables ################################################## m <- lvm() distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n Binary.lvm(), ## Vector of ones Binary.lvm(0.5), ## 0.5n 0, 0.5n 1 Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1)))) sim(m,10)
#> x z v w #> 1 0.0000000 1 0 0 #> 2 0.5555556 1 0 0 #> 3 1.1111111 1 0 1 #> 4 1.6666667 1 0 1 #> 5 2.2222222 1 0 1 #> 6 2.7777778 1 1 0 #> 7 3.3333333 1 1 0 #> 8 3.8888889 1 1 1 #> 9 4.4444444 1 1 1 #> 10 5.0000000 1 1 1
################################################## ### Cox model ### piecewise constant hazard ################################################ m <- lvm(t~x) rates <- c(1,0.5); cuts <- c(0,5) ## Constant rate: 1 in [0,5), 0.5 in [5,Inf) distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts) if (FALSE) { d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE plot(timereg::aalen(survival::Surv(t,status)~x,data=d, resample.iid=0,robust=0),spec=1) L <- approxfun(c(cuts,max(d$t)),f=1, cumsum(c(0,rates*diff(c(cuts,max(d$t))))), method="linear") curve(L,0,100,add=TRUE,col="blue") } ################################################## ### Cox model ### piecewise constant hazard, gamma frailty ################################################## m <- lvm(y~x+z) rates <- c(0.3,0.5); cuts <- c(0,5) distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts), loggamma.lvm(rate=1,shape=1)) if (FALSE) { d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE plot(timereg::aalen(survival::Surv(y,status)~x,data=d, resample.iid=0,robust=0),spec=1) L <- approxfun(c(cuts,max(d$y)),f=1, cumsum(c(0,rates*diff(c(cuts,max(d$y))))), method="linear") curve(L,0,100,add=TRUE,col="blue") } ## Equivalent via transform (here with Aalens additive hazard model) m <- lvm(y~x) distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts) distribution(m,~z) <- Gamma.lvm(rate=1,shape=1) transform(m,t~y+z) <- prod sim(m,10)
#> y x z t #> 1 0.3504609 3.07734178 0.80047627 0.28053563 #> 2 -64.6491771 -0.31848175 0.39379954 -25.45881626 #> 3 6.0457242 -0.08502363 0.63887608 3.86246860 #> 4 1.9675872 0.81133022 0.70311623 1.38344251 #> 5 -1.1417809 -0.53137860 0.90058416 -1.02826980 #> 6 0.4674614 1.00179887 0.82935702 0.38769237 #> 7 0.6922404 0.79769099 0.08787368 0.06082971 #> 8 0.9707728 0.32823279 1.53201476 1.48723828 #> 9 -0.7374005 -0.49151840 0.50868500 -0.37510457 #> 10 1.8513379 1.02202747 0.59051378 1.09324054
## Shared frailty m <- lvm(c(t1,t2)~x+z) rates <- c(1,0.5); cuts <- c(0,5) distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts) distribution(m,~z) <- loggamma.lvm(rate=1,shape=1) if (FALSE) { mets::fast.reshape(sim(m,100),varying="t") } ################################################## ### General multivariate distributions ################################################## if (FALSE) { m <- lvm() distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25) m <- lvm() distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop sim(m,10,p=c(or1=0.1,or2=4)) } m <- lvm() distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1) var(sim(m,100))
#> y1 y2 y3 #> y1 1.6549112 0.6228623 0.5262932 #> y2 0.6228623 1.5185498 0.7571519 #> y3 0.5262932 0.7571519 1.6762197
## Syntax also useful for univariate generators, e.g. m <- lvm(y~x+z) distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000) sim(m,5)
#> y x z #> 1 1001.3455 0.5818875 -0.3013739 #> 2 1002.1450 -0.1350401 1.2848582 #> 3 1000.0822 -1.0407206 1.5686740 #> 4 994.5267 -2.3053292 -1.8824933 #> 5 998.4568 -1.3360425 -0.3062621
distribution(m,~y,"m1",0) <- rnorm sim(m,5)
#> y x z #> 1 -0.5396791 0.3417136 -1.6480163 #> 2 -3.6095991 -1.6256374 -0.9574870 #> 3 2.1967654 -0.4768969 2.2250715 #> 4 -0.3940713 -0.6209352 -0.6273853 #> 5 -0.7275340 0.5040422 0.3378076
sim(m,5,p=c(m1=100))
#> y x z #> 1 102.18755 0.1787579 0.5007453 #> 2 98.04506 0.8271937 -1.8126868 #> 3 96.84215 -0.7196575 -1.4649504 #> 4 101.46870 1.6884779 -0.5837411 #> 5 95.68423 -1.1575305 -1.1663874
################################################## ### Regression design in other parameters ################################################## ## Variance heterogeneity m <- lvm(y~x) distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5) if (interactive()) plot(y~x,sim(m,1e3)) ## Alternaively, calculate the standard error directly addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame constrain(m,sd~x) <- function(x) exp(x)^.5 distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd) if (interactive()) plot(y~x,sim(m,1e3)) ## Regression on variance parameter m <- lvm() regression(m) <- y~x regression(m) <- v~x ##distribution(m,~v) <- 0 # No stochastic term ## Alternative: ## regression(m) <- v[NA:0]~x distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5) if (interactive()) plot(y~x,sim(m,1e3)) ## Regression on shape parameter in Weibull model m <- lvm() regression(m) <- y ~ z+v regression(m) <- s ~ exp(0.6*x-0.5*z) distribution(m,~x+z) <- binomial.lvm() distribution(m,~cens) <- coxWeibull.lvm(scale=1) distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s) eventTime(m) <- time ~ min(y=1,cens=0) if (interactive()) { d <- sim(m,1e3) require(survival) (cc <- coxph(Surv(time,status)~v+strata(x,z),data=d)) plot(survfit(cc) ,col=1:4,mark.time=FALSE) } ################################################## ### Categorical predictor ################################################## m <- lvm() ## categorical(m,K=3) <- "v" categorical(m,labels=c("A","B","C")) <- "v" regression(m,additive=FALSE) <- y~v if (FALSE) { plot(y~v,sim(m,1000,p=c("y~v:2"=3))) } m <- lvm() categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v" regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v ## equivalent to: ## regression(m,y~v,additive=FALSE) <- c(0,2,-1) regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v table(sim(m,1e4)$v)
#> #> A B C #> 5000 3051 1949
glm(y~v, data=sim(m,1e4))
#> #> Call: glm(formula = y ~ v, data = sim(m, 10000)) #> #> Coefficients: #> (Intercept) vB vC #> -0.003759 2.032846 -0.983280 #> #> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual #> Null Deviance: 22410 #> Residual Deviance: 9817 AIC: 28200
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#> #> Call: glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3))) #> #> Coefficients: #> (Intercept) vB vC #> -0.001579 3.033921 -0.988032 #> #> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual #> Null Deviance: 34150 #> Residual Deviance: 9792 AIC: 28180
transform(m,v2~v) <- function(x) x=='A' sim(m,10)
#> v y z v2 #> 1 B 2.7225014 4.3471577 FALSE #> 2 B 3.4227940 2.6654917 FALSE #> 3 A -0.7085905 2.1200748 TRUE #> 4 C -1.4228910 -2.3743813 FALSE #> 5 B 3.7990113 2.3934600 FALSE #> 6 A 0.7750815 -0.3692173 TRUE #> 7 C 0.1102358 -0.8214093 FALSE #> 8 B 1.5153310 6.1654183 FALSE #> 9 B 2.8497096 5.6700463 FALSE #> 10 B 2.0529501 4.3634553 FALSE
################################################## ### Pre-calculate object ################################################## m <- lvm(y~x) m2 <- sim(m,'y~x'=2) sim(m,10,'y~x'=2)
#> y x #> 1 -3.1798033 -1.4756611 #> 2 0.7071417 1.0002274 #> 3 2.5909554 1.4401825 #> 4 4.2044588 2.1870327 #> 5 -1.2835247 -0.3542078 #> 6 -0.6652916 0.1662725 #> 7 4.3367737 0.5560635 #> 8 -2.3096870 -1.3264723 #> 9 1.0442205 0.1227436 #> 10 -2.4180767 -0.5474568
sim(m2,10) ## Faster
#> y x #> 1 0.5160272 -0.32499194 #> 2 1.9639069 1.44766411 #> 3 0.8528626 -0.22711638 #> 4 1.3121923 1.50961388 #> 5 1.0176849 0.14595064 #> 6 -3.8403860 -1.48995665 #> 7 0.9179312 -0.13233502 #> 8 0.8011242 0.01558654 #> 9 2.1012753 1.01420451 #> 10 -1.4526002 -1.29120883