Getting Started with numerical.mle
Alexander Towell
2025-12-02
Source: vignettes/getting-started.Rmd
Introduction
The numerical.mle package provides numerical
optimization algorithms for maximum likelihood estimation (MLE). It
offers a unified, composable API with:
- Core solvers: Gradient ascent and Newton-Raphson optimization
- Meta-solvers: Grid search and random restart for global optimization
- Function transformers: Subsampling for stochastic gradient ascent, penalty terms for regularization
- Type-safe configuration: Explicit configuration objects for convergence criteria
Installation
# Install from GitHub
devtools::install_github("queelius/numerical.mle")
library(numerical.mle)
#> Registered S3 method overwritten by 'algebraic.dist':
#> method from
#> print.dist stats
Quick Start
Basic Example: Normal Distribution MLE
Let’s estimate the mean and standard deviation of normally distributed data:
# Generate sample data
set.seed(123)
data <- rnorm(100, mean = 5, sd = 2)
# Define log-likelihood
loglike <- function(theta) {
mu <- theta[1]
sigma <- theta[2]
if (sigma <= 0) return(-Inf)
sum(dnorm(data, mean = mu, sd = sigma, log = TRUE))
}
# Define score function (gradient of log-likelihood)
score <- function(theta) {
mu <- theta[1]
sigma <- theta[2]
n <- length(data)
c(
sum(data - mu) / sigma^2,
-n / sigma + sum((data - mu)^2) / sigma^3
)
}
# Constrain sigma to be positive
constraint <- mle_constraint(
support = function(theta) theta[2] > 0,
project = function(theta) c(theta[1], max(theta[2], 1e-6))
)
# Fit using gradient ascent with line search
result <- mle_gradient_ascent(
loglike = loglike,
score = score,
theta0 = c(0, 1),
config = mle_config_linesearch(max_iter = 100),
constraint = constraint
)
# View results
cat("Estimated mean:", result$theta.hat[1], "(true: 5)\n")
#> Estimated mean: 5.180485 (true: 5)
cat("Estimated sd:", result$theta.hat[2], "(true: 2)\n")
#> Estimated sd: 1.81624 (true: 2)
cat("Sample mean:", mean(data), "\n")
#> Sample mean: 5.180812
cat("Sample sd:", sd(data), "\n")
#> Sample sd: 1.825632
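As a quick sanity check, the numerical estimates can be compared against the closed-form MLEs for the normal model: the sample mean, and the standard deviation computed with an n (rather than n - 1) denominator, which is why the fitted sd sits slightly below sd(data):
# Closed-form MLEs for the normal model (base R only)
mu_mle <- mean(data)
sigma_mle <- sqrt(sum((data - mu_mle)^2) / length(data))  # n denominator, unlike sd()
c(mu_mle, sigma_mle)  # the numerical estimates above should closely match these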
Configuration Objects
The package uses type-safe configuration objects to control optimization behavior.
Base Configuration
# Basic configuration with convergence criteria
config <- mle_config(
max_iter = 200, # Maximum iterations
rel_tol = 1e-6, # Relative tolerance for convergence
trace = FALSE, # Store optimization path?
debug = FALSE # Print debug output?
)
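The configuration object is then handed to a solver through its config argument, just as in the Quick Start. A sketch, assuming the base configuration is accepted by mle_gradient_ascent (the specialized configurations below are passed the same way) and reusing the Quick Start model:
# Sketch: pass a configuration object to a solver (reusing the Quick Start model)
result <- mle_gradient_ascent(
loglike = loglike,
score = score,
theta0 = c(0, 1),
config = config,
constraint = constraint
)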
Gradient Configuration
For gradient-based methods with a fixed step, you can specify the learning rate:
config_grad <- mle_config_gradient(
eta = 0.1, # Learning rate / step size
max_iter = 100,
rel_tol = 1e-5
)
Line Search Configuration
Line search adaptively finds good step sizes (recommended for most problems):
config_ls <- mle_config_linesearch(
max_step = 1.0, # Maximum step size
backtrack_ratio = 0.5, # Step reduction factor
max_iter = 100,
rel_tol = 1e-5
)
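Roughly, backtracking line search tries the largest step first and shrinks it by backtrack_ratio until the log-likelihood improves. A minimal base-R sketch of the idea (for illustration only; not the package's internal implementation):
# Illustrative backtracking step along a search direction
backtrack_step <- function(loglike, theta, direction,
max_step = 1.0, backtrack_ratio = 0.5, max_tries = 20) {
step <- max_step
f0 <- loglike(theta)
for (i in seq_len(max_tries)) {
candidate <- theta + step * direction
if (loglike(candidate) > f0) return(candidate)  # accept the first improving step
step <- step * backtrack_ratio                  # otherwise shrink the step
}
theta  # no improving step found; stay put
}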
Core Solvers
Gradient Ascent
Gradient ascent uses the score function (gradient of log-likelihood) to iteratively climb toward the maximum:
# Simple quadratic problem: maximize -(x^2 + y^2)
loglike <- function(theta) -(theta[1]^2 + theta[2]^2)
score <- function(theta) -2 * theta
result <- mle_gradient_ascent(
loglike = loglike,
score = score,
theta0 = c(5, 5),
config = mle_config_linesearch(max_iter = 50)
)
cat("Solution:", result$theta.hat, "\n")
#> Solution: 0.0005340861 0.0005340861
cat("Iterations:", result$iter, "\n")
#> Iterations: 12
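In its simplest form, gradient ascent repeats the update theta <- theta + eta * score(theta) until the iterates stop moving. A hand-rolled fixed-step version of the same idea on this quadratic (illustration only):
# Fixed-step gradient ascent by hand (eta plays the role of mle_config_gradient's eta)
theta <- c(5, 5)
eta <- 0.1
for (i in 1:100) {
theta_new <- theta + eta * score(theta)      # step in the direction of the gradient
if (max(abs(theta_new - theta)) < 1e-8) break
theta <- theta_new
}
theta  # should be very close to c(0, 0)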
Newton-Raphson
Newton-Raphson uses second-order information (Fisher information matrix) for faster convergence:
# Same problem with Newton-Raphson
fisher <- function(theta) matrix(c(2, 0, 0, 2), nrow = 2)
result_nr <- mle_newton_raphson(
loglike = loglike,
score = score,
fisher = fisher,
theta0 = c(5, 5),
config = mle_config_linesearch(max_iter = 20)
)
cat("Solution:", result_nr$theta.hat, "\n")
#> Solution: 0.0005340861 0.0005340861
cat("Iterations:", result_nr$iter, "\n")
#> Iterations: 12
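The Newton (Fisher scoring) update is theta <- theta + solve(fisher(theta), score(theta)). For a quadratic log-likelihood like this one, that direction points exactly at the maximum, so a single full step lands on it:
# One Newton step by hand, using the score and Fisher information defined above
theta0 <- c(5, 5)
theta0 + solve(fisher(theta0), score(theta0))  # exactly c(0, 0) for this quadratic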
Constrained Optimization
Use mle_constraint to define domain constraints:
# Constrain parameters to be positive
constraint <- mle_constraint(
support = function(theta) all(theta > 0),
project = function(theta) pmax(theta, 1e-8)
)
# Box constraints [0, 10]
box_constraint <- mle_constraint(
support = function(theta) all(theta >= 0 & theta <= 10),
project = function(theta) pmax(0, pmin(10, theta))
)
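A constraint object is supplied to a solver through the constraint argument, exactly as in the Quick Start. A sketch reusing the quadratic loglike and score from the previous section:
# Sketch: pass a constraint to a solver (quadratic loglike/score from above)
result_con <- mle_gradient_ascent(
loglike = loglike,
score = score,
theta0 = c(5, 5),
config = mle_config_linesearch(max_iter = 50),
constraint = box_constraint   # keep iterates inside [0, 10]
)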
Meta-Solvers
Grid Search
Grid search evaluates the log-likelihood over a regular grid of parameter values, which is useful for finding good starting points:
loglike <- function(theta) -(theta[1] - 3)^2 - (theta[2] + 2)^2
result <- mle_grid_search(
loglike = loglike,
lower = c(-5, -5),
upper = c(5, 5),
grid_size = 10
)
cat("Best grid point:", result$theta.hat, "\n")
#> Best grid point: 2.777778 -1.666667
cat("Points evaluated:", result$n_evaluated, "\n")
#> Points evaluated: 100
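A common pattern is to feed the best grid point into a local solver as its starting value. For example (the score below is just the gradient of the quadratic objective above):
# Refine the grid-search result with gradient ascent
score <- function(theta) c(-2 * (theta[1] - 3), -2 * (theta[2] + 2))
refined <- mle_gradient_ascent(
loglike = loglike,
score = score,
theta0 = result$theta.hat,   # start from the best grid point
config = mle_config_linesearch(max_iter = 50)
)
refined$theta.hat  # should move from roughly (2.78, -1.67) toward the optimum (3, -2)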
Random Restart
Random restart runs multiple optimizations from random starting points, helping escape local optima:
loglike <- function(theta) -sum(theta^2)
score <- function(theta) -2 * theta
# Define how to sample starting points
sampler <- function() runif(2, -10, 10)
result <- mle_random_restart(
loglike = loglike,
solver = mle_grad,
theta0_sampler = sampler,
n_trials = 10,
score = score,
max_iter = 30
)
cat("Best solution:", result$theta.hat, "\n")
#> Best solution: -4.606192e-05 2.045309e-05
cat("Successful trials:", result$successful_trials, "/", result$n_trials, "\n")
#> Successful trials: 10 / 10
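The quadratic above has a single maximum, so every trial finds it. To see the benefit of restarting, consider a log-likelihood with two local maxima of different heights: a single run can get stuck on the lower mode, while the best trial should recover the global one. A sketch, assuming the same calling pattern as above (here theta is one-dimensional):
# A bimodal log-likelihood with local maxima near theta = -0.93 and theta = 1.06;
# the mode near 1.06 is the global maximum
loglike_bimodal <- function(theta) -(theta^2 - 1)^2 + 0.5 * theta
score_bimodal <- function(theta) -4 * theta * (theta^2 - 1) + 0.5
result_bimodal <- mle_random_restart(
loglike = loglike_bimodal,
solver = mle_grad,
theta0_sampler = function() runif(1, -3, 3),
n_trials = 10,
score = score_bimodal,
max_iter = 50
)
result_bimodal$theta.hat  # expected to land near 1.06 rather than the lower mode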
Function Transformers
Stochastic Gradient Ascent
For large datasets, use subsampling to create stochastic gradients:
# Large dataset
data <- rnorm(100000, mean = 5, sd = 2)
loglike_full <- function(theta, obs = data) {
sum(dnorm(obs, mean = theta[1], sd = theta[2], log = TRUE))
}
# Subsample only 100 observations per iteration
loglike_stoch <- with_subsampling(
loglike_full,
data = data,
subsample_size = 100
)
# Each evaluation uses a different random subsample
loglike_stoch(c(5, 2))
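Because a fresh subsample is drawn on each call, repeated evaluations at the same theta return slightly different values:
# Each call draws a new random subsample, so the three values differ slightly
replicate(3, loglike_stoch(c(5, 2)))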
Penalized Likelihood
Add regularization penalties to prevent overfitting:
loglike <- function(theta) -sum(theta^2)
# L1 penalty (LASSO)
loglike_l1 <- with_penalty(loglike, penalty_l1(), lambda = 0.1)
# L2 penalty (Ridge)
loglike_l2 <- with_penalty(loglike, penalty_l2(), lambda = 0.1)
# Elastic net (mix of L1 and L2)
loglike_enet <- with_penalty(
loglike,
penalty_elastic_net(alpha = 0.5),
lambda = 0.1
)
# Compare values
theta <- c(1, 2, 3)
cat("Original:", loglike(theta), "\n")
#> Original: -14
cat("With L1:", loglike_l1(theta), "\n")
#> With L1: -14.6
cat("With L2:", loglike_l2(theta), "\n")
#> With L2: -15.4
cat("With Elastic Net:", loglike_enet(theta), "\n")
#> With Elastic Net: -15
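These values are consistent with the standard penalty definitions scaled by lambda: here sum(abs(theta)) = 6 and sum(theta^2) = 14, so the penalized values can be reproduced by hand:
# Reproduce the penalized values by hand (assuming the standard penalty forms)
loglike(theta) - 0.1 * sum(abs(theta))                                   # L1:  -14.6
loglike(theta) - 0.1 * sum(theta^2)                                      # L2:  -15.4
loglike(theta) - 0.1 * (0.5 * sum(abs(theta)) + 0.5 * sum(theta^2))      # ENet: -15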
Tracking Optimization Progress
Enable tracing to record the optimization path:
loglike <- function(theta) -(theta[1]^2 + theta[2]^2)
score <- function(theta) -2 * theta
config <- mle_config_linesearch(max_iter = 20, trace = TRUE)
result <- mle_gradient_ascent(
loglike = loglike,
score = score,
theta0 = c(5, 5),
config = config
)
# The path is stored in result$path
if (!is.null(result$path)) {
cat("Optimization path (first 5 steps):\n")
print(head(result$path, 5))
}
#> Optimization path (first 5 steps):
#> [,1] [,2]
#> [1,] 4.292893 4.292893
#> [2,] 3.585786 3.585786
#> [3,] 2.878680 2.878680
#> [4,] 2.171573 2.171573
#> [5,] 1.464466 1.464466
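Because the path is stored as a matrix with one row per recorded iterate, it is easy to inspect or plot:
# Visualize the optimization path (one point per recorded iterate)
if (!is.null(result$path)) {
plot(result$path, type = "b",
xlab = "theta[1]", ylab = "theta[2]",
main = "Gradient ascent path")
}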
Complete Workflow Example
Here’s a complete example fitting a Poisson regression:
# Simulate Poisson data
set.seed(42)
n <- 200
x <- runif(n, 0, 5)
true_beta <- c(0.5, 0.3) # intercept and slope
lambda_true <- exp(true_beta[1] + true_beta[2] * x)
y <- rpois(n, lambda_true)
# Log-likelihood for Poisson regression
loglike <- function(beta) {
eta <- beta[1] + beta[2] * x
lambda <- exp(eta)
sum(dpois(y, lambda, log = TRUE))
}
# Score function
score <- function(beta) {
eta <- beta[1] + beta[2] * x
lambda <- exp(eta)
resid <- y - lambda
c(sum(resid), sum(resid * x))
}
# Fit model
result <- mle_grad(
loglike, score,
theta0 = c(0, 0),
max_iter = 100
)
cat("Estimated coefficients:", result$theta.hat, "\n")
cat("True coefficients:", true_beta, "\n")API Reference
API Reference
The main functions in the package are:
Configuration:
- mle_config() - Base configuration
- mle_config_gradient() - Gradient ascent configuration
- mle_config_linesearch() - Line search configuration
- mle_constraint() - Domain constraints
Solvers:
- mle_gradient_ascent() - Gradient ascent optimizer
- mle_newton_raphson() - Newton-Raphson optimizer
- mle_grid_search() - Grid search
- mle_random_restart() - Random restart wrapper
Convenience:
- mle_grad() - Quick gradient ascent
- mle_nr() - Quick Newton-Raphson
Transformers:
- with_subsampling() - Stochastic gradient via subsampling
- with_penalty() - Add penalty/regularization term
- penalty_l1(), penalty_l2(), penalty_elastic_net() - Penalty functions