Introduction

The likelihood.model package provides a flexible framework for specifying and using likelihood models for statistical inference in R. It supports:

  • Standard R distributions (normal, Weibull, exponential, etc.)
  • Censored data (left, right, and interval censoring)
  • Custom likelihood contributions for complex models
  • Maximum likelihood estimation with confidence intervals
  • Bootstrap sampling distributions

This vignette provides a quick introduction to the main features.

Installation

# Install from GitHub
if (!require(devtools)) install.packages("devtools")
devtools::install_github("queelius/likelihood.model")

Loading Packages

library(likelihood.model)

Example 1: Basic MLE with Weibull Distribution

Let’s start with a simple example: fitting a Weibull distribution to survival data using the weibull_uncensored model, which provides analytical derivatives for faster and more accurate estimation.

# Generate synthetic survival data
set.seed(42)
n <- 150
true_shape <- 2.5
true_scale <- 3.0
df <- data.frame(x = rweibull(n, shape = true_shape, scale = true_scale))

# Create the likelihood model
model <- weibull_uncensored("x")

# View model assumptions
assumptions(model)
#> [1] "independent"             "identically distributed"
#> [3] "exact observations"

# Fit the MLE
solver <- fit(model)
mle <- solver(df, par = c(1, 1))  # initial guess: shape=1, scale=1

# View results
summary(mle)
#> Maximum Likelihood Estimate (Fisherian)
#> ----------------------------------------
#> 
#> Coefficients:
#>      Estimate Std. Error  2.5% 97.5%
#> [1,]    2.152      0.131 1.896 2.409
#> [2,]    2.916      0.117 2.686 3.145
#> 
#> Log-likelihood: -238.7 
#> AIC: 481.4 
#> Number of observations: 150

Extracting Results

# Parameter estimates
cat("Estimated parameters:\n")
#> Estimated parameters:
print(coef(mle))
#> [1] 2.152 2.916

# Confidence intervals (Wald-based)
cat("\n95% Confidence Intervals:\n")
#> 
#> 95% Confidence Intervals:
print(confint(mle))
#>       2.5% 97.5%
#> [1,] 1.896 2.409
#> [2,] 2.686 3.145

# Standard errors
cat("\nStandard Errors:\n")
#> 
#> Standard Errors:
print(se(mle))
#> [1] 0.131 0.117

# Log-likelihood value
cat("\nLog-likelihood:", loglik_val(mle), "\n")
#> 
#> Log-likelihood: -238.7

# AIC for model selection
cat("AIC:", aic(mle), "\n")
#> AIC: 481.4

Example 2: Handling Censored Data

A key feature of likelihood.model is proper handling of censored data. Let’s simulate data with right-censoring and show how ignoring censoring leads to biased estimates.

# Generate normal data with right-censoring
set.seed(123)
n <- 200
true_mean <- 10
true_sd <- 2
censor_time <- 11

# Generate latent (true) values
x_latent <- rnorm(n, true_mean, true_sd)

# Apply censoring
censored <- x_latent > censor_time
df_cens <- data.frame(
  x = ifelse(censored, censor_time, x_latent),
  censor = ifelse(censored, "right", "exact")
)

cat("Censoring rate:", mean(censored) * 100, "%\n")
#> Censoring rate: 27.5 %

Fitting with Proper Censoring

# Create model that handles censoring
model_cens <- likelihood_name("norm", ob_col = "x", censor_col = "censor")

# Fit the model
solver_cens <- fit(model_cens)
mle_cens <- suppressWarnings(solver_cens(df_cens, par = c(0, 1)))

cat("MLE (accounting for censoring):\n")
#> MLE (accounting for censoring):
cat("  Mean:", coef(mle_cens)[1], "(true:", true_mean, ")\n")
#>   Mean: 9.912 (true: 10 )
cat("  SD:  ", coef(mle_cens)[2], "(true:", true_sd, ")\n")
#>   SD:   1.754 (true: 2 )

Comparison: Naive vs Proper Estimation

# Naive estimates (ignoring censoring)
naive_mean <- mean(df_cens$x)
naive_sd <- sd(df_cens$x)

cat("\nComparison of estimates:\n")
#> 
#> Comparison of estimates:
cat("                     Mean      SD\n")
#>                      Mean      SD
cat(sprintf("True values:      %7.3f   %5.3f\n", true_mean, true_sd))
#> True values:       10.000   2.000
cat(sprintf("MLE (correct):    %7.3f   %5.3f\n", coef(mle_cens)[1], coef(mle_cens)[2]))
#> MLE (correct):      9.912   1.754
cat(sprintf("Naive (biased):   %7.3f   %5.3f\n", naive_mean, naive_sd))
#> Naive (biased):     9.617   1.353
cat("\nNote: Naive estimates are biased downward due to censoring!\n")
#> 
#> Note: Naive estimates are biased downward due to censoring!

Example 3: Bootstrap Sampling Distribution

For more robust inference, especially with small samples, we can use bootstrap methods to estimate the sampling distribution of the MLE.

# Using the Weibull example from before
model <- weibull_uncensored("x")
df <- data.frame(x = rweibull(150, shape = 2.5, scale = 3.0))

# Create bootstrap sampler
boot_sampler <- sampler(model, df = df, par = c(1, 1))

# Generate bootstrap samples (100 replicates for speed)
boot_result <- boot_sampler(n = 100)

# Compare asymptotic vs bootstrap standard errors
mle <- fit(model)(df, par = c(1, 1))
asymp_se <- se(mle)
boot_se <- se(boot_result)  # Uses vcov from bootstrap replicates

cat("Standard Error Comparison:\n")
#> Standard Error Comparison:
cat("           Asymptotic  Bootstrap\n")
#>            Asymptotic  Bootstrap
cat(sprintf("Shape:     %10.4f  %9.4f\n", asymp_se[1], boot_se[1]))
#> Shape:         0.1623     0.2150
cat(sprintf("Scale:     %10.4f  %9.4f\n", asymp_se[2], boot_se[2]))
#> Scale:         0.1026     0.0989

# Bootstrap bias estimate
cat("\nBootstrap Bias Estimate:\n")
#> 
#> Bootstrap Bias Estimate:
print(bias(boot_result))
#> [1] 0.031600 0.007698

Example 4: Verifying the Score at MLE

At the MLE, the score function (gradient of log-likelihood) should be approximately zero. This provides a useful diagnostic.

model <- weibull_uncensored("x")
df <- data.frame(x = rweibull(100, 2, 1.5))

# Fit MLE
mle <- fit(model)(df, par = c(1, 1))

# Evaluate score at MLE
score_func <- score(model)
score_at_mle <- score_func(df, coef(mle))

cat("Score at MLE (should be near zero):\n")
#> Score at MLE (should be near zero):
print(score_at_mle)
#>    shape    scale 
#> -0.00338  0.01183

# The score is also stored in the MLE object
cat("\nScore stored in MLE object:\n")
#> 
#> Score stored in MLE object:
print(score_val(mle))
#>    shape    scale 
#> -0.00338  0.01183

The score values are very close to zero, confirming that the optimizer successfully found the MLE (a stationary point of the log-likelihood).

Example 5: Fisherian Likelihood Inference

The package supports pure likelihood-based inference in the Fisherian tradition. Instead of confidence intervals, we can compute likelihood intervals that make no probability statements about parameters.

# Fit a model
model <- weibull_uncensored("x")
df <- data.frame(x = rweibull(100, 2.0, 1.5))
mle <- fit(model)(df, par = c(1, 1))

# Support function: log relative likelihood
# S(theta) = logL(theta) - logL(theta_hat)
# Support at MLE is always 0
s_at_mle <- support(mle, coef(mle), df, model)
cat("Support at MLE:", s_at_mle, "\n")
#> Support at MLE: 0

# Support at alternative parameter values (negative = less supported)
s_alt <- support(mle, c(1.5, 1.0), df, model)
cat("Support at (1.5, 1.0):", s_alt, "\n")
#> Support at (1.5, 1.0): -18.64

# Relative likelihood = exp(support)
rl <- relative_likelihood(mle, c(1.5, 1.0), df, model)
cat("Relative likelihood at (1.5, 1.0):", rl, "\n")
#> Relative likelihood at (1.5, 1.0): 8.011e-09

Negative support and a very small relative likelihood indicate that the alternative parameter values (1.5, 1.0) are poorly supported by the data compared to the MLE.

Likelihood Intervals

# Compute 1/8 likelihood interval (roughly equivalent to 95% CI)
# This is the set of theta where R(theta) >= 1/8
li <- likelihood_interval(mle, df, model, k = 8)
print(li)
#> 1/8 Likelihood Interval (R >= 0.125)
#> -----------------------------------
#>      lower upper
#> [1,] 1.646 2.281
#> [2,] 1.299 1.623
#> attr(,"k")
#> [1] 8
#> attr(,"relative_likelihood_cutoff")
#> [1] 0.125

# Compare with Wald confidence interval
cat("\nWald 95% CI:\n")
#> 
#> Wald 95% CI:
print(confint(mle))
#>       2.5% 97.5%
#> [1,] 1.644 2.255
#> [2,] 1.301 1.608

Example 6: Closed-Form MLE with Exponential Lifetime Model

The exponential_lifetime model demonstrates a key design feature: specialized models can override fit() to bypass optim entirely when the MLE has a closed-form solution.

For Exponential(lambda) data, the MLE is simply lambda_hat = d / T, where d is the number of uncensored observations and T is the total observation time.

# Generate exponential survival data
set.seed(99)
df_exp <- data.frame(t = rexp(200, rate = 3.0))

# Create model and fit -- no initial guess needed!
model_exp <- exponential_lifetime("t")
mle_exp <- fit(model_exp)(df_exp)

cat("Closed-form MLE (no optim):\n")
#> Closed-form MLE (no optim):
cat("  lambda_hat:", coef(mle_exp), "(true: 3.0)\n")
#>   lambda_hat: 3.137 (true: 3.0)
cat("  SE:", se(mle_exp), "\n")
#>   SE: 0.2218

# The score at the MLE is exactly zero (by construction)
cat("  Score at MLE:", score_val(mle_exp), "\n")
#>   Score at MLE: 0

Right-Censored Exponential Data

The model handles right-censoring naturally through the sufficient statistics.

# Generate data with right-censoring at time 0.5
set.seed(99)
true_lambda <- 3.0
x <- rexp(200, rate = true_lambda)
censored <- x > 0.5
df_cens_exp <- data.frame(
  t = pmin(x, 0.5),
  status = ifelse(censored, "right", "exact")
)

cat("Censoring rate:", mean(censored) * 100, "%\n")
#> Censoring rate: 20.5 %

model_cens_exp <- exponential_lifetime("t", censor_col = "status")
mle_cens_exp <- fit(model_cens_exp)(df_cens_exp)

cat("MLE (censored):", coef(mle_cens_exp), "(true:", true_lambda, ")\n")
#> MLE (censored): 3.336 (true: 3 )
cat("95% CI:", confint(mle_cens_exp)[1, ], "\n")
#> 95% CI: 2.817 3.854

Cross-Validation: Matches likelihood_name("exp", ...)

The analytical model produces the same log-likelihood as the generic approach:

# Compare log-likelihood values at the MLE
lambda_hat <- coef(mle_exp)
ll_analytical <- loglik(model_exp)(df_exp, lambda_hat)

# likelihood_name("exp") uses dexp(x, rate), so pass unnamed parameter
df_exp_name <- data.frame(t = df_exp$t, censor = rep("exact", nrow(df_exp)))
model_generic <- likelihood_name("exp", ob_col = "t", censor_col = "censor")
ll_generic <- loglik(model_generic)(df_exp_name, unname(lambda_hat))

cat("Analytical loglik:", ll_analytical, "\n")
#> Analytical loglik: 28.66
cat("Generic loglik:   ", ll_generic, "\n")
#> Generic loglik:    28.66
cat("Match:", isTRUE(all.equal(unname(ll_analytical), ll_generic, tolerance = 1e-6)), "\n")
#> Match: TRUE

Summary of Key Functions

Model Creation

Function Description
likelihood_name() Create model for standard R distributions
weibull_uncensored() Weibull model with analytical derivatives (exact obs.)
exponential_lifetime() Exponential model with closed-form MLE and optional censoring
likelihood_contr_model$new() Custom likelihood contributions

Likelihood Calculus

Function Description
loglik() Get log-likelihood function
score() Get score (gradient) function
hess_loglik() Get Hessian of log-likelihood
fim() Fisher information matrix
observed_info() Observed information matrix

Estimation and Inference

Function Description
fit() Create MLE solver (returns fisher_mle)
sampler() Create bootstrap sampler (returns fisher_boot)
coef() Extract parameter estimates
vcov() Variance-covariance matrix
confint() Confidence intervals
se() Standard errors

Fisherian Inference

Function Description
support() Log relative likelihood: S(θ) = logL(θ) - logL(θ̂)
relative_likelihood() Likelihood ratio: R(θ) = L(θ)/L(θ̂)
likelihood_interval() Interval where R(θ) ≥ 1/k
profile_loglik() Profile log-likelihood
evidence() Evidence for θ₁ vs θ₂

Model Comparison

Function Description
lrt() Likelihood ratio test
aic() Akaike Information Criterion
bic() Bayesian Information Criterion
deviance() Deviance statistic

Next Steps

For more advanced usage, see the other vignettes:

  • Likelihood Named Distributions Model: Detailed coverage of likelihood_name with various censoring types
  • Likelihood Contributions Model: Building custom models for complex scenarios like series systems with component failures

Session Info

sessionInfo()
#> R version 4.3.3 (2024-02-29)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 24.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: America/Chicago
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] likelihood.model_0.9.1
#> 
#> loaded via a namespace (and not attached):
#>  [1] cli_3.6.5            knitr_1.50           rlang_1.1.7         
#>  [4] xfun_0.54            generics_0.1.4       textshaping_1.0.4   
#>  [7] jsonlite_2.0.0       htmltools_0.5.8.1    ragg_1.5.0          
#> [10] sass_0.4.10          rmarkdown_2.30       evaluate_1.0.5      
#> [13] jquerylib_0.1.4      MASS_7.3-60.0.1      fastmap_1.2.0       
#> [16] numDeriv_2016.8-1.1  yaml_2.3.11          mvtnorm_1.3-3       
#> [19] lifecycle_1.0.5      algebraic.mle_0.9.0  compiler_4.3.3      
#> [22] fs_1.6.6             htmlwidgets_1.6.4    algebraic.dist_0.1.0
#> [25] systemfonts_1.3.1    digest_0.6.39        R6_2.6.1            
#> [28] bslib_0.9.0          tools_4.3.3          boot_1.3-32         
#> [31] pkgdown_2.2.0        cachem_1.1.0         desc_1.4.3