Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.
Usage
# S3 method for class 'lvm'
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)Arguments
- x
Model object
- n
Number of simulated values/individuals
- p
Parameter value (optional)
- normal
Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition
- cond
for internal use
- sigma
Default residual variance (1)
- rho
Default covariance parameter (0.5)
- X
Optional matrix of fixed values of variables (manipulation)
- unlink
Return Inverse link transformed data
- latent
Include latent variables (default TRUE)
- use.labels
convert categorical variables to factors before applying transformation
- seed
Random seed
- ...
Additional arguments to be passed to the low level functions
Examples
##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#> y x z
#> 1 0 -1.0871117 0
#> 2 0 -0.5961507 0
#> 3 1 1.2456606 1
#> 4 0 -1.9011011 0
#> 5 1 0.9317311 0
#> 6 0 -1.0195031 0
e <- estimate(m,d,estimator="glm")
e
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 0.97832 0.09345 10.46919 <1e-12
#> y~z 1.04098 0.17281 6.02404 1.701e-09
#> x~z 1.11747 0.06529 17.11593 <1e-12
#> Intercepts:
#> y -0.01437 0.09874 -0.14555 0.8843
#> x -0.05580 0.04649 -1.20022 0.2301
#> Dispersion:
#> x 1.06783
## Simulate a few observation from estimated model
sim(e,n=5)
#> y x z
#> 1 0 -0.1463386 0
#> 2 1 0.9463616 1
#> 3 1 0.1632782 1
#> 4 0 -1.2455533 0
#> 5 1 2.8957783 1
##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#> y x z
#> 1 1 -0.1831437 1
#> 2 3 0.4235717 1
#> 3 19 1.4638566 1
#> 4 0 -1.3786736 1
#> 5 0 0.7720821 0
#> 6 2 -0.4325138 1
estimate(m,d,estimator="glm")
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 2.00013 0.00170 1179.77248 <1e-12
#> y~z 0.99804 0.01139 87.63590 <1e-12
#> x~z 0.99225 0.02256 43.99038 <1e-12
#> Intercepts:
#> y -0.99834 0.01161 -85.98726 <1e-12
#> x 0.00295 0.01927 0.15321 0.8782
#> Dispersion:
#> x 1.00388
mean(d$z); lava:::expit(1)
#> [1] 0.7255
#> [1] 0.7310586
summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#>
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -5.3176 -0.9817 0.0425 0.9454 5.3563
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 1.01691 0.04476 22.72 <2e-16 ***
#> x 3.98601 0.04489 88.80 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.414 on 998 degrees of freedom
#> Multiple R-squared: 0.8877, Adjusted R-squared: 0.8875
#> F-statistic: 7885 on 1 and 998 DF, p-value: < 2.2e-16
#>
##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#>
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.503872 0.005074 99.30 <2e-16 ***
#> x 1.015755 0.016366 62.07 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> (Dispersion parameter for Gamma family taken to be 0.5154657)
#>
#> Null deviance: 8439.4 on 9999 degrees of freedom
#> Residual deviance: 5553.3 on 9998 degrees of freedom
#> AIC: 20719
#>
#> Number of Fisher Scoring iterations: 6
#>
if (FALSE) MASS::gamma.shape(g) # \dontrun{}
args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE,
#> log = FALSE, ...)
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#> [1] -0.39232531 -0.65344132 -0.25008661 1.02265025 -0.29470894 -0.75875908
#> [7] -0.61782561 -0.01173492 0.22658620 -0.85462986
##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#> y
#> y 1.038929
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#> y
#> y 0.06458889
##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#>
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -3.1733 -0.6659 -0.0400 0.7135 3.6610
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.03640 0.06153 0.592 0.554
#> x 0.97845 0.04709 20.778 <2e-16 ***
#> z 1.01007 0.05412 18.663 <2e-16 ***
#> I(z > 0)TRUE -0.01420 0.10558 -0.134 0.893
#> x:I(z > 0)TRUE 1.02565 0.06515 15.744 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.015 on 995 degrees of freedom
#> Multiple R-squared: 0.7725, Adjusted R-squared: 0.7716
#> F-statistic: 844.9 on 4 and 995 DF, p-value: < 2.2e-16
#>
##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
Binary.lvm(), ## Vector of ones
Binary.lvm(0.5), ## 0.5n 0, 0.5n 1
Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#> x z v w
#> 1 0.0000000 1 0 0
#> 2 0.5555556 1 0 0
#> 3 1.1111111 1 0 1
#> 4 1.6666667 1 0 1
#> 5 2.2222222 1 0 1
#> 6 2.7777778 1 1 0
#> 7 3.3333333 1 1 0
#> 8 3.8888889 1 1 1
#> 9 4.4444444 1 1 1
#> 10 5.0000000 1 1 1
##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)
if (FALSE) { # \dontrun{
d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$t)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
} # }
##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
loggamma.lvm(rate=1,shape=1))
if (FALSE) { # \dontrun{
d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$y)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
} # }
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#> y x z t
#> 1 0.5579720 2.120772086 0.33831431 0.18876989
#> 2 1.4494352 -0.135160928 3.33297133 4.83092594
#> 3 0.4875242 1.070643367 0.04153628 0.02024994
#> 4 0.2123152 1.886413966 2.46505719 0.52336918
#> 5 0.2478579 1.136382241 0.34452875 0.08539416
#> 6 -0.7640068 -1.826504666 0.01332605 -0.01018119
#> 7 0.5955366 1.284139328 0.23062140 0.13734349
#> 8 0.7351995 -0.073520000 0.44168647 0.32472769
#> 9 0.9358395 -0.001793702 0.57737245 0.54032796
#> 10 0.9575549 0.155946016 1.98820808 1.90381833
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) { # \dontrun{
mets::fast.reshape(sim(m,100),varying="t")
} # }
##################################################
### General multivariate distributions
##################################################
if (FALSE) { # \dontrun{
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)
m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
} # }
m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#> y1 y2 y3
#> y1 2.673436 1.237220 1.280251
#> y2 1.237220 1.979091 1.121231
#> y3 1.280251 1.121231 2.231319
## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#> y x z
#> 1 999.6598 -1.2941958 -0.21599390
#> 2 995.4996 -1.3899030 -1.79740633
#> 3 1002.6299 1.0073420 0.46536012
#> 4 1000.0435 -1.4961390 0.72058077
#> 5 1000.0448 0.5213245 -0.00315081
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#> y x z
#> 1 -0.27522454 -1.2971259 0.21634297
#> 2 -1.80708496 0.4350938 -1.58337733
#> 3 0.05877631 0.5108942 -0.38597289
#> 4 2.84314360 1.5773881 -0.02885458
#> 5 -0.35209704 -1.0659894 1.81281571
sim(m,5,p=c(m1=100))
#> y x z
#> 1 101.0689 -0.35860612 0.2469560
#> 2 101.9527 1.93296840 -1.6293429
#> 3 100.7706 0.10528753 -0.4887392
#> 4 100.4315 -0.09999915 1.4974758
#> 5 100.7946 -0.13847376 -0.2257985
##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)
if (interactive()) {
d <- sim(m,1e3)
require(survival)
(cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}
##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"
regression(m,additive=FALSE) <- y~v
if (FALSE) { # \dontrun{
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
} # }
m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#>
#> A B C
#> 5006 3017 1977
glm(y~v, data=sim(m,1e4))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000))
#>
#> Coefficients:
#> (Intercept) vB vC
#> 0.02251 2.00064 -1.00417
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 22620
#> Residual Deviance: 9990 AIC: 28380
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#>
#> Coefficients:
#> (Intercept) vB vC
#> 0.003158 2.986700 -1.011000
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 34540
#> Residual Deviance: 10350 AIC: 28730
transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#> v y z v2
#> 1 B 2.46056403 2.8858782 FALSE
#> 2 B 2.13354634 4.1726117 FALSE
#> 3 B 2.83047801 5.0140621 FALSE
#> 4 A 0.61279199 0.1483464 TRUE
#> 5 A -0.10165782 0.5226807 TRUE
#> 6 A 1.00767457 -0.6966552 TRUE
#> 7 A -0.66799846 -0.9295116 TRUE
#> 8 A 0.87462526 -1.6325467 TRUE
#> 9 A 0.00129238 -1.1372980 TRUE
#> 10 C -2.20096227 0.5693128 FALSE
##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#> y x
#> 1 1.2233025 0.5982027
#> 2 -1.9169139 -0.1868558
#> 3 1.6462627 -0.1377385
#> 4 -1.2117513 -0.4254388
#> 5 3.0226593 1.6894790
#> 6 -3.9541658 -1.5825774
#> 7 0.8057423 0.2298156
#> 8 -2.1578435 -1.3413077
#> 9 1.8430087 0.6336803
#> 10 1.1745427 -0.2815957
sim(m2,10) ## Faster
#> y x
#> 1 -4.625097 -2.2077901
#> 2 -3.239577 -2.4106877
#> 3 2.582984 0.8197093
#> 4 2.435479 0.9182792
#> 5 3.031488 1.2826166
#> 6 -2.039096 -1.3749699
#> 7 -3.497009 -1.8324686
#> 8 2.049103 1.1580864
#> 9 -1.319555 0.2221127
#> 10 -5.157303 -1.4383467
