Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.
Usage
# S3 method for class 'lvm'
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)Arguments
- x
Model object
- n
Number of simulated values/individuals
- p
Parameter value (optional)
- normal
Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition
- cond
for internal use
- sigma
Default residual variance (1)
- rho
Default covariance parameter (0.5)
- X
Optional matrix of fixed values of variables (manipulation)
- unlink
Return Inverse link transformed data
- latent
Include latent variables (default TRUE)
- use.labels
convert categorical variables to factors before applying transformation
- seed
Random seed
- ...
Additional arguments to be passed to the low level functions
Examples
##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#> y x z
#> 1 1 -0.3890521 0
#> 2 1 1.5760916 1
#> 3 1 1.5709196 1
#> 4 1 1.4971902 1
#> 5 1 1.3515534 1
#> 6 1 0.6453046 0
e <- estimate(m,d,estimator="glm")
e
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 1.06972 0.09349 11.44259 <1e-12
#> y~z 1.01578 0.16776 6.05499 1.404e-09
#> x~z 0.96477 0.06532 14.77041 <1e-12
#> Intercepts:
#> y -0.13044 0.10078 -1.29431 0.1956
#> x 0.02210 0.04701 0.47012 0.6383
#> Dispersion:
#> x 1.06874
## Simulate a few observation from estimated model
sim(e,n=5)
#> y x z
#> 1 1 0.7578086 1
#> 2 0 0.6592641 1
#> 3 1 1.2263657 1
#> 4 1 0.2300801 0
#> 5 1 0.9931559 1
##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#> y x z
#> 1 14 1.01864520 1
#> 2 1 0.09862234 0
#> 3 0 -0.28820655 0
#> 4 19 1.53349369 1
#> 5 0 0.46584650 0
#> 6 6 1.12191083 1
estimate(m,d,estimator="glm")
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 1.99880 0.00192 1042.74787 <1e-12
#> y~z 0.98743 0.01116 88.49805 <1e-12
#> x~z 0.99262 0.02273 43.66784 <1e-12
#> Intercepts:
#> y -0.98504 0.01167 -84.44281 <1e-12
#> x 0.00358 0.01950 0.18345 0.8544
#> Dispersion:
#> x 1.00426
mean(d$z); lava:::expit(1)
#> [1] 0.7251
#> [1] 0.7310586
summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#>
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -4.2216 -1.0570 0.0261 0.9492 4.5983
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.99738 0.04548 21.93 <2e-16 ***
#> x 3.99724 0.04426 90.31 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.438 on 998 degrees of freedom
#> Multiple R-squared: 0.891, Adjusted R-squared: 0.8909
#> F-statistic: 8156 on 1 and 998 DF, p-value: < 2.2e-16
#>
##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#>
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.505874 0.005102 99.15 <2e-16 ***
#> x 1.008180 0.016302 61.84 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> (Dispersion parameter for Gamma family taken to be 0.5156528)
#>
#> Null deviance: 8397.4 on 9999 degrees of freedom
#> Residual deviance: 5546.3 on 9998 degrees of freedom
#> AIC: 20678
#>
#> Number of Fisher Scoring iterations: 6
#>
if (FALSE) MASS::gamma.shape(g) # \dontrun{}
args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE,
#> log = FALSE, ...)
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#> [1] -1.14993675 0.06110529 -0.78986992 1.68142084 0.13917097 -0.35847861
#> [7] 0.73652104 -1.62401234 -0.39977265 -1.16007252
##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#> y
#> y 1.14993
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#> y
#> y 0.0521512
##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#>
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -3.1528 -0.6738 -0.0486 0.7210 3.6658
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.021540 0.061655 0.349 0.727
#> x 0.987755 0.047652 20.728 <2e-16 ***
#> z 1.016270 0.054549 18.630 <2e-16 ***
#> I(z > 0)TRUE -0.005932 0.106185 -0.056 0.955
#> x:I(z > 0)TRUE 1.009156 0.065653 15.371 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.018 on 995 degrees of freedom
#> Multiple R-squared: 0.7704, Adjusted R-squared: 0.7694
#> F-statistic: 834.5 on 4 and 995 DF, p-value: < 2.2e-16
#>
##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
Binary.lvm(), ## Vector of ones
Binary.lvm(0.5), ## 0.5n 0, 0.5n 1
Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#> x z v w
#> 1 0.0000000 1 0 0
#> 2 0.5555556 1 0 0
#> 3 1.1111111 1 0 1
#> 4 1.6666667 1 0 1
#> 5 2.2222222 1 0 1
#> 6 2.7777778 1 1 0
#> 7 3.3333333 1 1 0
#> 8 3.8888889 1 1 1
#> 9 4.4444444 1 1 1
#> 10 5.0000000 1 1 1
##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)
if (FALSE) { # \dontrun{
d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$t)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
} # }
##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
loggamma.lvm(rate=1,shape=1))
if (FALSE) { # \dontrun{
d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$y)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
} # }
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#> y x z t
#> 1 -7.52089416 -0.3975626 0.0521017141 -3.918515e-01
#> 2 2.20571704 0.3470455 0.5086850008 1.122015e+00
#> 3 -4.36804686 -0.3792286 0.5905137765 -2.579392e+00
#> 4 -0.07640396 -1.5337391 0.2058743050 -1.572961e-02
#> 5 21.43594787 -0.2640809 0.0099792760 2.139152e-01
#> 6 4.25372148 0.1839688 0.4443781625 1.890261e+00
#> 7 -0.09900900 -1.4246557 0.0006665787 -6.599729e-05
#> 8 -1.15906345 -1.0335661 0.8045995309 -9.325819e-01
#> 9 6.72402679 -0.2305547 0.6546673730 4.402001e+00
#> 10 5.53307450 -0.1962768 0.2215395973 1.225795e+00
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) { # \dontrun{
mets::fast.reshape(sim(m,100),varying="t")
} # }
##################################################
### General multivariate distributions
##################################################
if (FALSE) { # \dontrun{
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)
m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
} # }
m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#> y1 y2 y3
#> y1 2.546658 1.2446886 1.1131815
#> y2 1.244689 2.1601392 0.8173183
#> y3 1.113182 0.8173183 1.8257718
## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#> y x z
#> 1 1000.5214 0.2177780 1.93296840
#> 2 999.1417 -0.4748423 0.10528753
#> 3 999.1108 -2.2866782 -0.09999915
#> 4 998.8764 -0.7593130 -0.13847376
#> 5 1001.0689 -0.3586061 0.24695597
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#> y x z
#> 1 -0.5899834 0.5347800 -2.3598938
#> 2 -0.4243369 -0.9962041 -0.5107305
#> 3 3.5456795 1.7645527 1.5297775
#> 4 -1.4880155 -0.7726338 -0.1372156
#> 5 -0.2474045 0.3482477 -0.1443716
sim(m,5,p=c(m1=100))
#> y x z
#> 1 102.25740 0.129915309 2.7875521
#> 2 101.53418 1.820998862 -0.3153138
#> 3 98.71054 -0.007577379 -0.2353492
#> 4 99.17310 -0.901698372 -0.4610830
#> 5 103.29305 0.072049531 1.9003038
##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)
if (interactive()) {
d <- sim(m,1e3)
require(survival)
(cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}
##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"
regression(m,additive=FALSE) <- y~v
if (FALSE) { # \dontrun{
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
} # }
m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#>
#> A B C
#> 5004 3018 1978
glm(y~v, data=sim(m,1e4))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000))
#>
#> Coefficients:
#> (Intercept) vB vC
#> 0.02492 1.99774 -1.00229
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 22580
#> Residual Deviance: 9997 AIC: 28380
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#>
#> Coefficients:
#> (Intercept) vB vC
#> 0.003437 2.987178 -1.009686
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 34490
#> Residual Deviance: 10330 AIC: 28710
transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#> v y z v2
#> 1 B 2.2298156 4.1679282 FALSE
#> 2 B 0.6586923 3.7329303 FALSE
#> 3 B 2.6336803 2.2362200 FALSE
#> 4 C -1.2815957 -3.2806097 FALSE
#> 5 B 1.7904833 1.7922099 FALSE
#> 6 A 1.5817981 -2.4106877 TRUE
#> 7 A 0.9435654 0.8197093 TRUE
#> 8 A 0.5989207 0.9182792 TRUE
#> 9 C -0.5337450 0.2826166 FALSE
#> 10 A 0.7108434 -1.3749699 TRUE
##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#> y x
#> 1 -2.9969686 -0.5822500
#> 2 2.0932122 0.4675629
#> 3 2.4677225 1.1228049
#> 4 1.3229557 1.3806512
#> 5 3.1524416 1.9010546
#> 6 -0.6734257 -0.6149359
#> 7 0.4034541 -0.4402311
#> 8 0.5278260 -0.2312745
#> 9 1.3184855 0.7196587
#> 10 -0.6764807 -0.2205306
sim(m2,10) ## Faster
#> y x
#> 1 0.2093413 -0.1414400
#> 2 1.7423583 0.9679083
#> 3 2.3735490 0.8827087
#> 4 -2.9847372 -1.3114255
#> 5 0.8275647 -0.2983199
#> 6 -3.4592201 -1.8890341
#> 7 2.4341144 0.9988011
#> 8 -1.1049947 -0.4407522
#> 9 -0.4269766 -0.1266534
#> 10 -1.4520110 -0.6076527
