Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.
# S3 method for lvm
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)
Model object
Additional arguments to be passed to the low level functions
Number of simulated values/individuals
Parameter value (optional)
Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition
for internal use
Default residual variance (1)
Default covariance parameter (0.5)
Optional matrix of fixed values of variables (manipulation)
Return Inverse link transformed data
Include latent variables (default TRUE)
convert categorical variables to factors before applying transformation
Random seed
##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#> y x z
#> 1 0 -1.3821237 0
#> 2 1 0.1592860 0
#> 3 0 -0.2024402 0
#> 4 1 -0.5407706 0
#> 5 1 3.0592772 1
#> 6 0 -1.0871117 0
e <- estimate(m,d,estimator="glm")
e
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 1.16436 0.09460 12.30808 <1e-12
#> y~z 0.92734 0.17162 5.40331 6.542e-08
#> x~z 1.17538 0.06522 18.02299 <1e-12
#> Intercepts:
#> y -0.10564 0.10227 -1.03300 0.3016
#> x -0.08681 0.04530 -1.91634 0.05532
#> Dispersion:
#> x 1.06572
## Simulate a few observation from estimated model
sim(e,n=5)
#> y x z
#> 1 0 0.09892573 1
#> 2 1 1.79234819 1
#> 3 1 1.39730452 1
#> 4 1 -0.34995267 0
#> 5 1 1.29523903 1
##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#> y x z
#> 1 16 1.4257214 1
#> 2 0 -1.4682438 0
#> 3 1 0.1739810 1
#> 4 1 0.3735584 1
#> 5 4 0.1141325 1
#> 6 0 -1.1831437 0
estimate(m,d,estimator="glm")
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 1.99862 0.00167 1194.23410 <1e-12
#> y~z 1.00920 0.01135 88.93651 <1e-12
#> x~z 1.05711 0.02256 46.86270 <1e-12
#> Intercepts:
#> y -1.00566 0.01169 -86.01586 <1e-12
#> x -0.04412 0.01927 -2.28899 0.02208
#> Dispersion:
#> x 1.00309
mean(d$z); lava:::expit(1)
#> [1] 0.7254
#> [1] 0.7310586
summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#>
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -5.2750 -0.9708 0.0370 0.9322 5.3931
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.99595 0.04454 22.36 <2e-16 ***
#> x 4.00101 0.04458 89.74 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.407 on 998 degrees of freedom
#> Multiple R-squared: 0.8897, Adjusted R-squared: 0.8896
#> F-statistic: 8054 on 1 and 998 DF, p-value: < 2.2e-16
#>
##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#>
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.504608 0.005085 99.23 <2e-16 ***
#> x 1.014488 0.016342 62.08 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> (Dispersion parameter for Gamma family taken to be 0.5150804)
#>
#> Null deviance: 8427.8 on 9999 degrees of freedom
#> Residual deviance: 5548.4 on 9998 degrees of freedom
#> AIC: 20679
#>
#> Number of Fisher Scoring iterations: 6
#>
if (FALSE) MASS::gamma.shape(g)
args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE,
#> log = FALSE, ...)
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#> [1] 2.1503587 0.5181377 0.5199346 -0.3664227 0.8646908 1.1661599
#> [7] 0.0231124 1.7150221 -1.4569616 -0.0718344
##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#> y
#> y 0.9935516
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#> y
#> y 0.04904392
##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#>
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -3.6065 -0.6391 0.0365 0.6927 3.0687
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.017082 0.063323 -0.270 0.787
#> x 1.026723 0.045408 22.611 <2e-16 ***
#> z 1.003452 0.054980 18.251 <2e-16 ***
#> I(z > 0)TRUE 0.002987 0.109813 0.027 0.978
#> x:I(z > 0)TRUE 0.939556 0.063042 14.904 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.017 on 995 degrees of freedom
#> Multiple R-squared: 0.7794, Adjusted R-squared: 0.7785
#> F-statistic: 878.7 on 4 and 995 DF, p-value: < 2.2e-16
#>
##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
Binary.lvm(), ## Vector of ones
Binary.lvm(0.5), ## 0.5n 0, 0.5n 1
Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#> x z v w
#> 1 0.0000000 1 0 0
#> 2 0.5555556 1 0 0
#> 3 1.1111111 1 0 1
#> 4 1.6666667 1 0 1
#> 5 2.2222222 1 0 1
#> 6 2.7777778 1 1 0
#> 7 3.3333333 1 1 0
#> 8 3.8888889 1 1 1
#> 9 4.4444444 1 1 1
#> 10 5.0000000 1 1 1
##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)
if (FALSE) {
d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$t)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
}
##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
loggamma.lvm(rate=1,shape=1))
if (FALSE) {
d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$y)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
}
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#> y x z t
#> 1 0.3504609 3.07734178 0.80047627 0.28053563
#> 2 -64.6491771 -0.31848175 0.39379954 -25.45881626
#> 3 6.0457242 -0.08502363 0.63887608 3.86246860
#> 4 1.9675872 0.81133022 0.70311623 1.38344251
#> 5 -1.1417809 -0.53137860 0.90058416 -1.02826980
#> 6 0.4674614 1.00179887 0.82935702 0.38769237
#> 7 0.6922404 0.79769099 0.08787368 0.06082971
#> 8 0.9707728 0.32823279 1.53201476 1.48723828
#> 9 -0.7374005 -0.49151840 0.50868500 -0.37510457
#> 10 1.8513379 1.02202747 0.59051378 1.09324054
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) {
mets::fast.reshape(sim(m,100),varying="t")
}
##################################################
### General multivariate distributions
##################################################
if (FALSE) {
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)
m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
}
m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#> y1 y2 y3
#> y1 1.6549112 0.6228623 0.5262932
#> y2 0.6228623 1.5185498 0.7571519
#> y3 0.5262932 0.7571519 1.6762197
## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#> y x z
#> 1 1001.3455 0.5818875 -0.3013739
#> 2 1002.1450 -0.1350401 1.2848582
#> 3 1000.0822 -1.0407206 1.5686740
#> 4 994.5267 -2.3053292 -1.8824933
#> 5 998.4568 -1.3360425 -0.3062621
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#> y x z
#> 1 -0.5396791 0.3417136 -1.6480163
#> 2 -3.6095991 -1.6256374 -0.9574870
#> 3 2.1967654 -0.4768969 2.2250715
#> 4 -0.3940713 -0.6209352 -0.6273853
#> 5 -0.7275340 0.5040422 0.3378076
sim(m,5,p=c(m1=100))
#> y x z
#> 1 102.18755 0.1787579 0.5007453
#> 2 98.04506 0.8271937 -1.8126868
#> 3 96.84215 -0.7196575 -1.4649504
#> 4 101.46870 1.6884779 -0.5837411
#> 5 95.68423 -1.1575305 -1.1663874
##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)
if (interactive()) {
d <- sim(m,1e3)
require(survival)
(cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}
##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"
regression(m,additive=FALSE) <- y~v
if (FALSE) {
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
}
m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#>
#> A B C
#> 5000 3051 1949
glm(y~v, data=sim(m,1e4))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000))
#>
#> Coefficients:
#> (Intercept) vB vC
#> -0.003759 2.032846 -0.983280
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 22410
#> Residual Deviance: 9817 AIC: 28200
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#>
#> Coefficients:
#> (Intercept) vB vC
#> -0.001579 3.033921 -0.988032
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 34150
#> Residual Deviance: 9792 AIC: 28180
transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#> v y z v2
#> 1 B 2.7225014 4.3471577 FALSE
#> 2 B 3.4227940 2.6654917 FALSE
#> 3 A -0.7085905 2.1200748 TRUE
#> 4 C -1.4228910 -2.3743813 FALSE
#> 5 B 3.7990113 2.3934600 FALSE
#> 6 A 0.7750815 -0.3692173 TRUE
#> 7 C 0.1102358 -0.8214093 FALSE
#> 8 B 1.5153310 6.1654183 FALSE
#> 9 B 2.8497096 5.6700463 FALSE
#> 10 B 2.0529501 4.3634553 FALSE
##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#> y x
#> 1 -3.1798033 -1.4756611
#> 2 0.7071417 1.0002274
#> 3 2.5909554 1.4401825
#> 4 4.2044588 2.1870327
#> 5 -1.2835247 -0.3542078
#> 6 -0.6652916 0.1662725
#> 7 4.3367737 0.5560635
#> 8 -2.3096870 -1.3264723
#> 9 1.0442205 0.1227436
#> 10 -2.4180767 -0.5474568
sim(m2,10) ## Faster
#> y x
#> 1 0.5160272 -0.32499194
#> 2 1.9639069 1.44766411
#> 3 0.8528626 -0.22711638
#> 4 1.3121923 1.50961388
#> 5 1.0176849 0.14595064
#> 6 -3.8403860 -1.48995665
#> 7 0.9179312 -0.13233502
#> 8 0.8011242 0.01558654
#> 9 2.1012753 1.01420451
#> 10 -1.4526002 -1.29120883