Priority Tests to Add - Action Plan¶
This document provides concrete, copy-paste-ready test code to immediately improve test coverage for the RecursiveInfinigram system.
Phase 1: Critical Coverage Gaps (Add These First)¶
File: tests/test_recursive.py¶
Add these test classes to the existing test_recursive.py file:
class TestPredictionCombining:
"""Test prediction combining logic - CRITICAL GAP."""
def test_combine_empty_predictions_returns_empty(self):
"""
Given: Empty list of weighted predictions
When: Combining predictions
Then: Returns empty dictionary
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
result = model._combine_predictions([])
assert result == {}
assert isinstance(result, dict)
def test_combine_single_prediction_normalizes(self):
"""
Given: Single prediction with probabilities
When: Combining predictions
Then: Result is normalized to sum to 1.0
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
# Single prediction: 'a' (65) with 0.3, 'b' (66) with 0.7
weighted_predictions = [
({65: 0.3, 66: 0.7}, 1.0)
]
result = model._combine_predictions(weighted_predictions)
# Should normalize (already normalized in this case)
assert abs(sum(result.values()) - 1.0) < 1e-9, \
f"Expected sum=1.0, got {sum(result.values())}"
def test_combine_respects_weights(self):
"""
Given: Two predictions with different weights
When: Combining predictions
Then: Higher weight prediction contributes more
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
# Two predictions: high weight for 'A', low weight for 'B'
weighted_predictions = [
({65: 1.0}, 0.9), # 'A' with weight 0.9
({66: 1.0}, 0.1), # 'B' with weight 0.1
]
result = model._combine_predictions(weighted_predictions)
assert 65 in result and 66 in result
assert result[65] > result[66], \
f"Expected A (65) > B (66), got {result[65]} vs {result[66]}"
def test_combine_overlapping_predictions_sum(self):
"""
Given: Multiple predictions for the same byte
When: Combining predictions
Then: Probabilities are weighted and summed
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
# Both predict 'A' (65) with equal weight
weighted_predictions = [
({65: 0.5}, 0.5),
({65: 0.8}, 0.5),
]
result = model._combine_predictions(weighted_predictions)
# (0.5*0.5 + 0.8*0.5) / (0.5*0.5 + 0.8*0.5) = 1.0
assert 65 in result
assert abs(result[65] - 1.0) < 1e-9
def test_combine_multiple_bytes_multiple_predictions(self):
"""
Given: Multiple predictions with multiple bytes each
When: Combining predictions
Then: All bytes correctly weighted and normalized
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
weighted_predictions = [
({65: 0.7, 66: 0.3}, 0.6), # Weight 0.6
({65: 0.4, 67: 0.6}, 0.4), # Weight 0.4
]
result = model._combine_predictions(weighted_predictions)
# Should have all bytes
assert 65 in result # 'A'
assert 66 in result # 'B'
assert 67 in result # 'C'
# Should normalize to 1.0
total = sum(result.values())
assert abs(total - 1.0) < 1e-9, f"Expected sum=1.0, got {total}"
# 'A' appears in both, should have highest probability
assert result[65] > result[66]
assert result[65] > result[67]
class TestTransformerEdgeCases:
"""Test edge cases in transformers."""
def test_edit_distance_calculation_accuracy(self):
"""
Given: Pairs of words with known edit distances
When: Calculating Levenshtein distance
Then: Returns correct distance
"""
transformer = EditDistanceTransformer(max_distance=5)
# Test known distances
test_cases = [
(b"cat", b"cat", 0), # Identical
(b"cat", b"bat", 1), # One substitution
(b"cat", b"ca", 1), # One deletion
(b"cat", b"cart", 1), # One insertion
(b"kitten", b"sitting", 3), # Classic example
(b"", b"abc", 3), # Empty string
(b"abc", b"", 3), # Empty string
]
for word1, word2, expected_dist in test_cases:
actual_dist = transformer._edit_distance(word1, word2)
assert actual_dist == expected_dist, \
f"Edit distance {word1} → {word2}: expected {expected_dist}, got {actual_dist}"
def test_synonym_transformer_no_prefix_to_transform(self):
"""
Given: Context where suffix matches entire context
When: Generating transformations
Then: Returns empty list (no prefix to transform)
"""
corpus = b"the cat sat on the mat"
transformer = SynonymTransformer()
# Suffix equals entire context
context = b"sat"
suffix = b"sat"
positions = [8] # Position of "sat" in corpus
transformations = transformer.generate_transformations(
context=context,
suffix=suffix,
corpus=corpus,
match_positions=positions
)
assert transformations == []
def test_edit_distance_transformer_no_prefix_to_transform(self):
"""
Given: Context where suffix matches entire context
When: Generating transformations
Then: Returns empty list (no prefix to transform)
"""
corpus = b"the cat sat on the mat"
transformer = EditDistanceTransformer(max_distance=2)
context = b"mat"
suffix = b"mat"
positions = [19]
transformations = transformer.generate_transformations(
context=context,
suffix=suffix,
corpus=corpus,
match_positions=positions
)
assert transformations == []
def test_case_normalizer_already_lowercase(self):
"""
Given: Context that is already lowercase
When: Generating transformations
Then: Returns empty list (no transformation needed)
"""
transformer = CaseNormalizer()
context = b"the cat sat"
suffix = b"sat"
corpus = b"irrelevant"
match_positions = []
transformations = transformer.generate_transformations(
context=context,
suffix=suffix,
corpus=corpus,
match_positions=match_positions
)
assert transformations == []
class TestRecursiveTransformDepthAndBeam:
"""Test recursive transformation with various depths and beam widths."""
def test_beam_width_one_limits_candidates(self):
"""
Given: Beam width of 1
When: Generating transformations recursively
Then: Only best candidate is explored at each level
"""
corpus = b"the cat sat on the mat. the dog ran fast."
model = RecursiveInfinigram(corpus)
context = b"The Cat"
# Beam width = 1 should still work
contexts = model._recursive_transform(
context=context,
depth=0,
max_depth=2,
seen=set(),
beam_width=1
)
# Should have at least original
assert len(contexts) >= 1
def test_large_beam_width_explores_more(self):
"""
Given: Large beam width
When: Generating transformations recursively
Then: More candidates are explored
"""
corpus = b"the cat sat on the mat. the dog ran fast."
model = RecursiveInfinigram(corpus)
context = b"The Cat"
# Large beam should explore more
contexts = model._recursive_transform(
context=context,
depth=0,
max_depth=2,
seen=set(),
beam_width=10
)
# Should have original + transformations
assert len(contexts) >= 1
def test_no_matches_returns_only_original(self):
"""
Given: Context that has no matches in corpus
When: Generating transformations recursively
Then: Returns only original context (no transformations possible)
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
# Context completely outside corpus vocabulary
context = b"xyz"
contexts = model._recursive_transform(
context=context,
depth=0,
max_depth=2,
seen=set(),
beam_width=5
)
# Should return only original (no matches to transform from)
assert len(contexts) == 1
assert contexts[0][0] == context
assert contexts[0][1] == []
Phase 2: Integration Tests¶
New File: tests/test_recursive_integration.py¶
Create this new file with integration tests:
#!/usr/bin/env python3
"""
Integration tests for RecursiveInfinigram end-to-end workflows.
Tests the full pipeline: Context → Transformers → Scorer → Predictor
"""
import pytest
from infinigram.recursive import RecursiveInfinigram, CaseNormalizer, EditDistanceTransformer
from infinigram.scoring import create_conservative_scorer, create_aggressive_scorer
class TestEndToEndPredictionFlow:
"""Test complete prediction flow from context to output."""
def test_case_normalization_enables_prediction(self):
"""
Given: Corpus with lowercase text
When: Context has uppercase letters
Then: Case normalization enables successful prediction
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(
corpus,
transformers=[CaseNormalizer()]
)
# Uppercase context (not in corpus)
context = b"The Cat"
probs = model.predict(context, max_depth=2, beam_width=5)
# Should make some prediction (via case normalization)
# Can't guarantee specific prediction, but should not be empty
assert isinstance(probs, dict)
# If case normalization works, should find match and predict
def test_prediction_with_explanation_includes_transformations(self):
"""
Given: Context requiring transformation
When: Predicting with explanation
Then: Explanations include transformation details
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
context = b"The Cat" # Uppercase
probs, explanations = model.predict_with_explanation(
context,
max_depth=2,
beam_width=3
)
# Should have explanations
assert isinstance(explanations, list)
assert len(explanations) > 0
# Check explanation structure
for exp in explanations:
assert 'context' in exp
assert 'transformations' in exp
assert 'match_length' in exp
assert 'match_frequency' in exp
assert 'weight' in exp
assert 'predictions' in exp
# Weight should be in valid range
assert 0.0 <= exp['weight'] <= 1.0
class TestScorerImpactOnPredictions:
"""Test that different scorers affect prediction outcomes."""
def test_conservative_vs_aggressive_scorer_behavior(self):
"""
Given: Same corpus and context
When: Using conservative vs aggressive scorer
Then: Scorers produce different weight distributions
"""
corpus = b"the quick brown fox jumps over the lazy dog"
conservative_model = RecursiveInfinigram(
corpus,
scorer=create_conservative_scorer()
)
aggressive_model = RecursiveInfinigram(
corpus,
scorer=create_aggressive_scorer()
)
# Context with case difference
context = b"The Quick"
_, conservative_explanations = conservative_model.predict_with_explanation(
context, max_depth=2
)
_, aggressive_explanations = aggressive_model.predict_with_explanation(
context, max_depth=2
)
# Both should generate explanations
assert len(conservative_explanations) > 0
assert len(aggressive_explanations) > 0
# Weights should differ between scorers
# (Conservative penalizes transformations more)
conservative_weights = [exp['weight'] for exp in conservative_explanations]
aggressive_weights = [exp['weight'] for exp in aggressive_explanations]
# At least check they computed weights
assert all(w >= 0 for w in conservative_weights)
assert all(w >= 0 for w in aggressive_weights)
class TestTransformationChaining:
"""Test multiple transformations in sequence."""
def test_multiple_transformations_tracked_in_explanation(self):
"""
Given: Context requiring multiple transformations
When: Recursing with max_depth > 1
Then: Explanation shows chain of transformations
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
context = b"The Dog" # Both case and word difference
probs, explanations = model.predict_with_explanation(
context,
max_depth=3, # Allow chaining
beam_width=5
)
# Should have explanations with varying transformation depths
assert len(explanations) > 0
# Check if any explanation has multiple transformations
has_chain = any(len(exp['transformations']) > 1 for exp in explanations)
# Check transformation list structure
for exp in explanations:
assert isinstance(exp['transformations'], list)
for transform_desc in exp['transformations']:
assert isinstance(transform_desc, str)
Phase 3: Robustness and Error Handling¶
Add to tests/test_recursive.py:¶
class TestRecursiveInfinigramRobustness:
"""Test robustness and error handling."""
def test_empty_corpus_initialization(self):
"""
Given: Empty corpus
When: Initializing RecursiveInfinigram
Then: Initializes without crashing (may have no predictions)
"""
corpus = b""
# Should not crash
model = RecursiveInfinigram(corpus)
assert model.corpus == corpus
assert model.model is not None
def test_empty_context_prediction(self):
"""
Given: Empty context
When: Making prediction
Then: Returns empty dict (no context to match)
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
context = b""
probs = model.predict(context, max_depth=1)
# Should return dict (possibly empty)
assert isinstance(probs, dict)
def test_context_longer_than_corpus(self):
"""
Given: Context longer than entire corpus
When: Making prediction
Then: Handles gracefully (no match expected)
"""
corpus = b"cat"
model = RecursiveInfinigram(corpus)
context = b"the quick brown fox jumps over the lazy dog"
probs = model.predict(context, max_depth=1)
# Should return dict (likely empty)
assert isinstance(probs, dict)
def test_unicode_handling_in_corpus(self):
"""
Given: Corpus with UTF-8 characters
When: Making predictions
Then: Handles unicode correctly
"""
corpus = "the café is open".encode('utf-8')
model = RecursiveInfinigram(corpus)
context = "the café".encode('utf-8')
probs = model.predict(context, max_depth=1)
assert isinstance(probs, dict)
def test_very_deep_recursion_does_not_crash(self):
"""
Given: Very deep max_depth
When: Making prediction
Then: Does not cause stack overflow
"""
corpus = b"the cat sat on the mat"
model = RecursiveInfinigram(corpus)
context = b"The Cat"
# Very deep recursion (should be stopped by max_depth)
probs = model.predict(context, max_depth=10, beam_width=2)
assert isinstance(probs, dict)
Phase 4: Fill Evaluation Coverage Gaps¶
Add to tests/test_evaluation.py:¶
class TestEvaluatorEdgeCases:
"""Test evaluator edge cases for full coverage."""
def test_evaluate_with_verbose_output(self):
"""
Given: Test data with multiple samples
When: Evaluating with verbose=True
Then: Prints progress messages (covers logging lines)
"""
corpus = b"the cat sat on the mat"
model = Infinigram(corpus)
evaluator = Evaluator(model, "Test")
# Create 100 samples to trigger progress printing
test_data = [(b"the", b" ")] * 100
# Should print progress without crashing
metrics, results = evaluator.evaluate(test_data, top_k=5, verbose=True)
assert len(results) == 100
assert isinstance(metrics, EvaluationMetrics)
def test_evaluate_with_no_predictions(self):
"""
Given: Model that never returns predictions
When: Evaluating
Then: Handles gracefully with inf perplexity
"""
# Mock model that always returns empty dict
class NoOpModel:
def predict(self, context, top_k=10):
return {}
model = NoOpModel()
evaluator = Evaluator(model, "NoOp")
test_data = [(b"test", b"x"), (b"data", b"y")]
metrics, results = evaluator.evaluate(test_data, top_k=5)
# Coverage should be 0%
assert metrics.coverage == 0.0
# Perplexity should be infinity (no predictions)
assert metrics.perplexity == float('inf')
# Mean probability should be 0
assert metrics.mean_probability == 0.0
# All predictions should be None
assert all(r.predicted is None for r in results)
class TestBenchmarkSuiteVerbose:
"""Test benchmark suite with verbose output."""
def test_compare_models_with_verbose(self):
"""
Given: Multiple models and datasets
When: Comparing with verbose=True
Then: Prints comparison info (covers logging lines)
"""
corpus = b"the cat sat on the mat"
models = {
"Vanilla": Infinigram(corpus),
"Recursive": RecursiveInfinigram(corpus),
}
suite = BenchmarkSuite(corpus)
test_data = suite.create_in_distribution_test(10, 5)
test_datasets = {
"Test": test_data,
}
# Should print verbose output
results = suite.compare_models(
models=models,
test_datasets=test_datasets,
top_k=5,
verbose=True # Enable verbose logging
)
assert "Vanilla" in results
assert "Recursive" in results
Test Execution Plan¶
Step 1: Add Phase 1 Tests (Critical)¶
# Add the TestPredictionCombining class to test_recursive.py
# Add the TestTransformerEdgeCases class to test_recursive.py
# Add the TestRecursiveTransformDepthAndBeam class to test_recursive.py
# Run tests
python -m pytest tests/test_recursive.py -v
# Check coverage improvement
python -m pytest tests/test_recursive.py --cov=infinigram.recursive --cov-report=term
Expected Coverage Improvement: 41% → ~60%
Step 2: Add Phase 2 Tests (Integration)¶
# Create tests/test_recursive_integration.py with integration tests
# Run integration tests
python -m pytest tests/test_recursive_integration.py -v
# Check full coverage
python -m pytest tests/test_recursive*.py --cov=infinigram.recursive --cov-report=term
Expected Coverage Improvement: ~60% → ~75%
Step 3: Add Phase 3 Tests (Robustness)¶
# Add TestRecursiveInfinigramRobustness to test_recursive.py
# Run all recursive tests
python -m pytest tests/test_recursive*.py -v
# Final coverage check
python -m pytest tests/test_recursive*.py --cov=infinigram.recursive --cov-report=term
Expected Coverage Improvement: ~75% → ~85%
Step 4: Add Phase 4 Tests (Evaluation gaps)¶
# Add edge case tests to test_evaluation.py
# Run evaluation tests
python -m pytest tests/test_evaluation.py -v
# Check coverage
python -m pytest tests/test_evaluation.py --cov=infinigram.evaluation --cov-report=term
Expected Coverage Improvement: 93% → 98%
Success Criteria¶
After adding all Phase 1-4 tests:
- ✅
infinigram/recursive.py: 85%+ coverage (up from 41%) - ✅
infinigram/evaluation.py: 98%+ coverage (up from 93%) - ✅
infinigram/scoring.py: 100% coverage (maintained) - ✅ All tests pass
- ✅ No implementation changes needed (tests verify existing behavior)
Notes on Test Philosophy¶
These tests follow TDD best practices:
- Test Behavior, Not Implementation
- Focus on observable outcomes (predictions, scores, transformations)
-
Don't test internal data structures unless they're part of the contract
-
Clear Given-When-Then Structure
- Each test has clear setup, action, and assertion
-
Test names describe the behavior being tested
-
Independent Tests
- Each test can run in any order
- No shared state between tests
-
Fresh model instances per test
-
Focused Assertions
- Each test verifies one behavior
- Assertions have helpful error messages
-
Edge cases are explicit
-
Resilient to Refactoring
- Tests will pass even if internal implementation changes
- Only break if actual behavior changes
- Enable confident refactoring