Replay Examples¶
Practical examples for deterministic replay and A/B testing.
Basic Exact Replay¶
Re-execute a captured trace without making API calls:
from prela.replay import ReplayEngine
from prela.replay.loader import TraceLoader
# Load trace from file
trace = TraceLoader.from_file("traces.jsonl")

# Create replay engine
engine = ReplayEngine(trace)

# Exact replay (deterministic, no API calls)
result = engine.replay_exact()

# Inspect results
print(f"Trace ID: {result.trace_id}")
print(f"Total Spans: {len(result.spans)}")
print(f"Duration: {result.total_duration_ms:.2f}ms")
print(f"Total Tokens: {result.total_tokens}")
print(f"Estimated Cost: ${result.total_cost_usd:.4f}")

# Examine individual spans
for span in result.spans:
    print(f"\nSpan: {span.name}")
    print(f" Type: {span.span_type}")
    print(f" Duration: {span.duration_ms:.2f}ms")
    # Span outputs are not always strings (tool spans can hold dicts,
    # retrieval spans hold lists), so stringify before truncating.
    print(f" Output: {str(span.output)[:100]}...")  # First 100 chars
A/B Testing: Compare Models¶
Test GPT-4 vs Claude Sonnet:
from prela.replay import ReplayEngine, compare_replays
from prela.replay.loader import TraceLoader
# Load trace (originally GPT-4)
trace = TraceLoader.from_file("gpt4_trace.jsonl")
engine = ReplayEngine(trace)

# Baseline: Original execution
original = engine.replay_exact()

# Experiment: Claude Sonnet
claude_result = engine.replay_with_modifications(
    model="claude-sonnet-4-20250514"
)

# Compare
comparison = compare_replays(original, claude_result)

# Print summary
print(comparison.generate_summary())

# Detailed analysis
print("\n=== Cost Analysis ===")
print(f"GPT-4 Cost: ${original.total_cost_usd:.4f}")
print(f"Claude Cost: ${claude_result.total_cost_usd:.4f}")
print(f"Savings: ${original.total_cost_usd - claude_result.total_cost_usd:.4f}")

print("\n=== Quality Analysis ===")
for diff in comparison.differences:
    # Compare against None explicitly: a score of 0.0 is falsy but is
    # exactly the "complete divergence" case that must be reported.
    if diff.field == "output" and diff.semantic_similarity is not None:
        print(f"{diff.span_name}:")
        print(f" Semantic Similarity: {diff.semantic_similarity:.1%}")
        if diff.semantic_similarity > 0.85:
            print(" ✓ High quality match")
        else:
            print(" ⚠️ Significant divergence")
Parameter Tuning: Temperature¶
Find optimal temperature setting:
from prela.replay import ReplayEngine
from prela.replay.loader import TraceLoader
trace = TraceLoader.from_file("trace.jsonl")
engine = ReplayEngine(trace)

# Test different temperatures
temperatures = [0.0, 0.3, 0.5, 0.7, 1.0]

# One modified replay per temperature, keyed by the temperature used
results = {
    temp: engine.replay_with_modifications(temperature=temp)
    for temp in temperatures
}

# Compare outputs
print("Temperature Comparison")
print("=" * 50)
for temp, result in results.items():
    print(f"\nTemperature: {temp}")
    print(f" Tokens: {result.total_tokens}")
    print(f" Cost: ${result.total_cost_usd:.4f}")
    print(f" Duration: {result.total_duration_ms:.0f}ms")
    # Show first span output
    if result.spans:
        output = result.spans[0].output
        print(f" Output preview: {output[:80]}...")
Batch Model Comparison¶
Compare multiple models and configurations:
from prela.replay import ReplayEngine, compare_replays
from prela.replay.loader import TraceLoader
import json
trace = TraceLoader.from_file("trace.jsonl")
engine = ReplayEngine(trace)

# Baseline
baseline = engine.replay_exact()

# Experiments: name -> keyword arguments for replay_with_modifications
experiments = {
    "gpt-4o": {"model": "gpt-4o"},
    "gpt-4o-temp0.7": {"model": "gpt-4o", "temperature": 0.7},
    "claude-sonnet": {"model": "claude-sonnet-4-20250514"},
    "claude-haiku": {"model": "claude-3-haiku-20240307"},
}
results = {}
comparisons = {}

# Run experiments
for name, modifications in experiments.items():
    print(f"Running: {name}...")
    result = engine.replay_with_modifications(**modifications)
    results[name] = result
    comparisons[name] = compare_replays(baseline, result)

# Generate report
report = {
    "baseline": {
        "cost": baseline.total_cost_usd,
        "tokens": baseline.total_tokens,
        "duration_ms": baseline.total_duration_ms,
    },
    "experiments": {},
}
for name, result in results.items():
    # Collect the scores once. Filter on "is not None" so that a genuine
    # 0.0 similarity still counts towards the average.
    similarities = [
        d.semantic_similarity
        for d in comparisons[name].differences
        if d.semantic_similarity is not None
    ]
    report["experiments"][name] = {
        "cost": result.total_cost_usd,
        "cost_delta": result.total_cost_usd - baseline.total_cost_usd,
        "tokens": result.total_tokens,
        "tokens_delta": result.total_tokens - baseline.total_tokens,
        "duration_ms": result.total_duration_ms,
        # None (JSON null) when no difference carried a similarity score
        "avg_similarity": (
            sum(similarities) / len(similarities) if similarities else None
        ),
    }

# Save report
with open("experiment_report.json", "w") as f:
    json.dump(report, f, indent=2)

print("\n=== Experiment Report ===")
print(json.dumps(report, indent=2))

# Find best option
best_cost = min(results.items(), key=lambda x: x[1].total_cost_usd)
print(f"\nLowest cost: {best_cost[0]} (${best_cost[1].total_cost_usd:.4f})")
System Prompt Testing¶
Test different instruction styles:
from prela.replay import ReplayEngine, compare_replays
from prela.replay.loader import TraceLoader
trace = TraceLoader.from_file("trace.jsonl")
engine = ReplayEngine(trace)

# Original
original = engine.replay_exact()

# Test variations
prompts = {
    "concise": "You are a helpful assistant. Be concise and direct.",
    "detailed": "You are a helpful assistant. Provide detailed explanations with examples.",
    "technical": "You are a technical expert. Use precise terminology and cite sources.",
}
results = {}
for name, prompt in prompts.items():
    result = engine.replay_with_modifications(system_prompt=prompt)
    results[name] = result

# Analyze outputs
print("System Prompt Comparison")
print("=" * 50)
for name, result in results.items():
    comparison = compare_replays(original, result)
    print(f"\n{name.upper()}")
    print(f" Prompt: {prompts[name]}")
    print(f" Output length: {len(result.spans[0].output) if result.spans else 0} chars")
    print(f" Tokens: {result.total_tokens}")
    print(f" Cost: ${result.total_cost_usd:.4f}")

    # Semantic similarity ("is not None" keeps a real 0.0 score in the mean)
    similarities = [
        d.semantic_similarity
        for d in comparison.differences
        if d.semantic_similarity is not None
    ]
    if similarities:
        avg_sim = sum(similarities) / len(similarities)
        print(f" Avg similarity: {avg_sim:.1%}")
Regression Testing¶
Ensure new model versions don't break functionality:
from prela.replay import ReplayEngine, compare_replays
from prela.replay.loader import TraceLoader
def regression_test(trace_file, new_model, similarity_threshold=0.85):
    """
    Test if new model produces similar results to original.

    Replays the trace exactly, replays it again with ``new_model``, and
    classifies every "output" difference that carries a semantic
    similarity score as passed or failed against the threshold.

    Args:
        trace_file: Path of the trace file handed to ``TraceLoader.from_file``.
        new_model: Model name passed to ``replay_with_modifications``.
        similarity_threshold: Minimum similarity for a span to pass.

    Returns:
        dict: Test results with pass/fail status
    """
    trace = TraceLoader.from_file(trace_file)
    engine = ReplayEngine(trace)

    # Baseline
    original = engine.replay_exact()

    # New version
    new_result = engine.replay_with_modifications(model=new_model)

    # Compare
    comparison = compare_replays(original, new_result)

    # Analyze
    passed_spans = []
    failed_spans = []
    for diff in comparison.differences:
        # "is not None" rather than truthiness: a similarity of 0.0 is the
        # worst possible regression and must be counted as a failure, not
        # silently skipped.
        if diff.field != "output" or diff.semantic_similarity is None:
            continue
        if diff.semantic_similarity >= similarity_threshold:
            passed_spans.append({
                "name": diff.span_name,
                "similarity": diff.semantic_similarity,
            })
        else:
            failed_spans.append({
                "name": diff.span_name,
                "similarity": diff.semantic_similarity,
                "original": diff.original_value[:200],
                "new": diff.new_value[:200],
            })

    # Results
    total_spans = len(passed_spans) + len(failed_spans)
    pass_rate = len(passed_spans) / total_spans if total_spans > 0 else 0
    return {
        "passed": pass_rate >= 0.95,  # 95% pass rate required
        "pass_rate": pass_rate,
        "passed_spans": len(passed_spans),
        "failed_spans": len(failed_spans),
        "failures": failed_spans,
    }
# Run regression tests
test_files = [
    "test_cases/customer_support.jsonl",
    "test_cases/code_review.jsonl",
    "test_cases/data_analysis.jsonl",
]
for test_file in test_files:
    print(f"\nTesting: {test_file}")
    result = regression_test(test_file, new_model="gpt-4o")
    # pass_rate is the fraction of spans that met the threshold, not a
    # similarity score, so label it accordingly.
    if result["passed"]:
        print(f"✓ PASSED ({result['pass_rate']:.1%} of spans passed)")
    else:
        print(f"✗ FAILED ({result['pass_rate']:.1%} of spans passed)")
        print(f" Failed spans: {result['failed_spans']}")
        for failure in result["failures"]:
            print(f"\n - {failure['name']} ({failure['similarity']:.1%})")
            print(f" Original: {failure['original']}...")
            print(f" New: {failure['new']}...")
Cost Optimization¶
Find the cheapest model that maintains quality:
from prela.replay import ReplayEngine, compare_replays
from prela.replay.loader import TraceLoader
def find_optimal_model(trace_file, models, min_similarity=0.85):
    """
    Find cheapest model with acceptable quality.

    Args:
        trace_file: Path to trace file
        models: List of model names to test
        min_similarity: Minimum semantic similarity threshold

    Returns:
        dict: Best model and cost analysis. ``candidates`` is sorted by
        cost (cheapest first) and ``best_option`` is its first entry, or
        None when no model met the threshold.
    """
    trace = TraceLoader.from_file(trace_file)
    engine = ReplayEngine(trace)

    # Baseline
    original = engine.replay_exact()

    # Test models
    candidates = []
    for model in models:
        result = engine.replay_with_modifications(model=model)
        comparison = compare_replays(original, result)

        # Calculate average similarity. Filter on "is not None" so a real
        # 0.0 score lowers the average instead of being dropped, which
        # would make a badly diverging model look better than it is.
        similarities = [
            d.semantic_similarity
            for d in comparison.differences
            if d.semantic_similarity is not None
        ]
        avg_similarity = sum(similarities) / len(similarities) if similarities else 0

        if avg_similarity >= min_similarity:
            candidates.append({
                "model": model,
                "cost": result.total_cost_usd,
                "similarity": avg_similarity,
                "tokens": result.total_tokens,
            })

    # Sort by cost (cheapest first)
    candidates.sort(key=lambda x: x["cost"])

    return {
        "original_cost": original.total_cost_usd,
        "original_model": original.spans[0].attributes.get("llm.model") if original.spans else None,
        "candidates": candidates,
        "best_option": candidates[0] if candidates else None,
    }
# Test model lineup
models_to_test = [
    "gpt-4o",
    "gpt-4o-mini",
    "claude-sonnet-4-20250514",
    "claude-3-haiku-20240307",
]
result = find_optimal_model("trace.jsonl", models_to_test)

print("Cost Optimization Results")
print("=" * 50)
print(f"Original: {result['original_model']} (${result['original_cost']:.4f})")
if result["best_option"]:
    best = result["best_option"]
    savings = result["original_cost"] - best["cost"]
    # Guard the percentage: a baseline cost of 0 would otherwise raise
    # ZeroDivisionError.
    if result["original_cost"]:
        savings_pct = (savings / result["original_cost"]) * 100
    else:
        savings_pct = 0.0
    print(f"\n✓ Best Option: {best['model']}")
    print(f" Cost: ${best['cost']:.4f}")
    print(f" Savings: ${savings:.4f} ({savings_pct:.1f}%)")
    print(f" Quality: {best['similarity']:.1%} similar")
    print(f" Tokens: {best['tokens']}")
    print("\nAll Candidates:")
    for candidate in result["candidates"]:
        print(f" - {candidate['model']}: ${candidate['cost']:.4f} ({candidate['similarity']:.1%})")
else:
    print("\n✗ No models met quality threshold")
Mock Tool Responses¶
Test different tool outputs:
from prela.replay import ReplayEngine
from prela.replay.loader import TraceLoader
trace = TraceLoader.from_file("trace_with_tools.jsonl")
engine = ReplayEngine(trace)

# Original execution
original = engine.replay_exact()

# Test with different tool responses
mock_responses = {
    "search": {
        "results": [
            {"title": "Alternative Result 1", "url": "https://example.com/1"},
            {"title": "Alternative Result 2", "url": "https://example.com/2"},
        ]
    },
    "calculator": {
        "result": 42  # Different calculation result
    },
}
modified = engine.replay_with_modifications(
    mock_tool_responses=mock_responses
)

# Compare how agent adapts to different tool outputs
print("Tool Response Testing")
print("=" * 50)

# The mocked tool outputs above are dicts, which cannot be sliced, so
# stringify each output before truncating it for display.
print("\nOriginal Tool Outputs:")
for span in original.spans:
    if span.span_type == "tool":
        print(f" {span.name}: {str(span.output)[:100]}...")

print("\nModified Tool Outputs:")
for span in modified.spans:
    if span.span_type == "tool":
        print(f" {span.name}: {str(span.output)[:100]}...")

print("\nAgent Behavior Changes:")
# Compare final outputs
original_output = original.spans[-1].output if original.spans else ""
modified_output = modified.spans[-1].output if modified.spans else ""
print(f"Original: {original_output[:200]}...")
print(f"Modified: {modified_output[:200]}...")
Tool Re-execution¶
Re-execute tools during replay with safety controls.
Basic Tool Re-execution¶
from prela.replay import ReplayEngine
from prela.replay.loader import TraceLoader
# Load a captured trace that contains tool-call spans
trace = TraceLoader.from_file("trace_with_tools.jsonl")
# The engine is built once from the loaded trace
engine = ReplayEngine(trace)
# Define tool implementations
def my_calculator(input_data):
    """Calculate sum of two numbers.

    Args:
        input_data: Mapping with optional "a" and "b" entries; a missing
            entry is treated as 0.

    Returns:
        dict: ``{"result": a + b}``.
    """
    a = input_data.get("a", 0)
    b = input_data.get("b", 0)
    result = a + b
    print(f"Calculator: {a} + {b} = {result}")
    return {"result": result}
def my_search(input_data):
    """Search for information.

    Args:
        input_data: Mapping with an optional "query" entry (defaults to "").

    Returns:
        dict: ``{"results": [...]}`` holding two fixed placeholder hits;
        the query is only echoed to stdout, it does not affect the results.
    """
    query = input_data.get("query", "")
    print(f"Searching for: {query}")
    # Simulate fresh search
    return {
        "results": [
            {"title": "Fresh Result 1", "url": "https://example.com/1"},
            {"title": "Fresh Result 2", "url": "https://example.com/2"},
        ]
    }
# Create tool registry (tool name -> callable)
tool_registry = {
    "calculator": my_calculator,
    "search": my_search,
}

# Original execution (cached data)
original = engine.replay_exact()

# Re-execute tools with current implementations
modified = engine.replay_with_modifications(
    enable_tool_execution=True,
    tool_registry=tool_registry,
)

# Compare results
print("\nTool Re-execution Comparison")
print("=" * 60)
# Spans are paired by position; zip stops at the shorter of the two lists.
for orig_span, mod_span in zip(original.spans, modified.spans):
    if orig_span.span_type == "tool":
        print(f"\n{orig_span.name}:")
        print(f" Cached: {orig_span.output}")
        print(f" Fresh: {mod_span.output}")
        print(f" Changed: {orig_span.output != mod_span.output}")
Using Allowlist for Safety¶
from prela.replay import ReplayEngine
from prela.replay.loader import TraceLoader
# Load the captured trace and build a replay engine for it
trace = TraceLoader.from_file("trace_with_tools.jsonl")
engine = ReplayEngine(trace)
# Define tools
def safe_calculator(input_data):
    """Return ``{"result": a + b}`` for the required "a" and "b" entries.

    Raises:
        KeyError: If "a" or "b" is missing from ``input_data``.
    """
    return {"result": input_data["a"] + input_data["b"]}
def dangerous_delete(input_data):
    """Stand-in for a destructive tool; always raises if it is ever called.

    Raises:
        RuntimeError: Unconditionally.
    """
    # This should never execute!
    raise RuntimeError("This tool should be blocked!")
tool_registry = {
    "calculator": safe_calculator,
    "delete_file": dangerous_delete,
}

# Only allow calculator to execute
result = engine.replay_with_modifications(
    enable_tool_execution=True,
    tool_execution_allowlist=["calculator"],  # Only allow this
    tool_registry=tool_registry,
)

# Check which tools ran
for span in result.spans:
    if span.span_type == "tool":
        if span.error:
            print(f"❌ {span.name}: {span.error}")
        else:
            print(f"✅ {span.name}: {span.output}")

# Output:
# ✅ calculator: {'result': 10}
# ❌ delete_file: Tool 'delete_file' not in allowlist
Using Blocklist for Safety¶
# Block dangerous tools explicitly
result = engine.replay_with_modifications(
    enable_tool_execution=True,
    tool_execution_blocklist=["delete_file", "shutdown", "execute_code"],
    tool_registry=tool_registry,
)

# Blocked tools will fail with error
for span in result.spans:
    if span.span_type == "tool" and span.error:
        print(f"🚫 Blocked: {span.name}")
Testing with Mock vs Real Execution¶
# Three sources of tool output, from highest to lowest priority.

# Priority 1: Mocks (highest)
search_mock = {"search": {"results": ["Mocked result"]}}
mock_result = engine.replay_with_modifications(mock_tool_responses=search_mock)

# Priority 2: Real execution (if no mocks)
real_result = engine.replay_with_modifications(
    enable_tool_execution=True,
    tool_registry=tool_registry,
)

# Priority 3: Cached (default, if execution not enabled)
cached_result = engine.replay_exact()

# Show the first span's output from each replay, in priority order
for label, replay in (
    ("Mock output:", mock_result),
    ("Real output:", real_result),
    ("Cached output:", cached_result),
):
    print(label, replay.spans[0].output)
Retrieval Re-execution¶
Re-query vector databases to test with updated data.
ChromaDB Re-execution¶
from prela.replay import ReplayEngine
from prela.replay.loader import TraceLoader
import chromadb
# Load trace with retrieval spans
trace = TraceLoader.from_file("trace_with_retrieval.jsonl")
engine = ReplayEngine(trace)

# Setup ChromaDB with fresh data
client = chromadb.Client()
collection = client.create_collection("my_docs")

# Add updated documents
collection.add(
    documents=[
        "Python is a high-level programming language",
        "JavaScript is widely used for web development",
        "Rust provides memory safety without garbage collection",
    ],
    ids=["1", "2", "3"],
)

# Original execution (cached documents)
original = engine.replay_exact()

# Re-query with current vector store
modified = engine.replay_with_modifications(
    enable_retrieval_execution=True,
    retrieval_client=collection,
)

# Compare retrieved documents
print("Retrieval Comparison")
print("=" * 60)
# Spans are paired by position between the two replays
for orig_span, mod_span in zip(original.spans, modified.spans):
    if orig_span.span_type == "retrieval":
        print(f"\nQuery: {orig_span.input}")

        # Preview at most two documents per side, 80 characters each
        print(f"\nOriginal Documents ({len(orig_span.output)}):")
        for doc in orig_span.output[:2]:
            print(f" - {doc.get('text', '')[:80]}...")

        print(f"\nFresh Documents ({len(mod_span.output)}):")
        for doc in mod_span.output[:2]:
            print(f" - {doc.get('text', '')[:80]}...")
Query Override¶
# Original query: "What is Python?"
# Replay the same trace, but send a different query to the vector store
override_options = {
    "enable_retrieval_execution": True,
    "retrieval_client": collection,
    "retrieval_query_override": "What is JavaScript?",
}
result = engine.replay_with_modifications(**override_options)

# Report the first span of each replay side by side
print(f"Original query: {original.spans[0].input}")
print(f"New query: {result.spans[0].input}")
print(f"Results changed: {original.spans[0].output != result.spans[0].output}")
A/B Testing Retrieval Strategies¶
# Test multiple query formulations
queries = [
    "Python programming language features",  # Detailed
    "Python features",  # Concise
    "What makes Python popular?",  # Question form
]
results = {}
for query in queries:
    result = engine.replay_with_modifications(
        enable_retrieval_execution=True,
        retrieval_client=collection,
        retrieval_query_override=query,
    )

    # Analyze results: documents returned by the first span
    docs = result.spans[0].output
    scores = [d.get("score", 0) for d in docs]
    results[query] = {
        "doc_count": len(docs),
        # An empty result set scores 0 rather than dividing by zero
        "avg_score": sum(scores) / len(scores) if scores else 0,
    }

# Find best query
best_query = max(results.items(), key=lambda x: x[1]["avg_score"])
print(f"\nBest Query: {best_query[0]}")
print(f"Avg Score: {best_query[1]['avg_score']:.2f}")
Custom Retry Configuration¶
Configure retry behavior for different scenarios.
High-Retry Configuration (Flaky Networks)¶
from prela.replay import ReplayEngine
from prela.replay.loader import TraceLoader
trace = TraceLoader.from_file("trace.jsonl")

# More retries, longer delays for unreliable connections
engine = ReplayEngine(
    trace,
    max_retries=5,  # More attempts
    retry_initial_delay=2.0,  # Start with 2s delay
    retry_max_delay=120.0,  # Allow up to 2 minutes
    retry_exponential_base=2.0,
)
result = engine.replay_with_modifications(model="gpt-4o")

# Check retry statistics
total_retries = sum(span.retry_count for span in result.spans)
print(f"Total retries needed: {total_retries}")

# Show which spans needed retries
for span in result.spans:
    if span.retry_count > 0:
        print(f"⚠️ {span.name}: {span.retry_count} retries")
Fast-Fail Configuration (Production)¶
# Minimal retries for production (fail fast)
fast_fail_settings = {
    "max_retries": 1,  # Only 1 retry
    "retry_initial_delay": 0.5,  # Quick retry
    "retry_max_delay": 2.0,  # Max 2s delay
}
engine = ReplayEngine(trace, **fast_fail_settings)
result = engine.replay_with_modifications(model="gpt-4o")
Monitoring Retry Patterns¶
import time

# perf_counter() is monotonic, so the measured interval cannot be skewed
# by system clock adjustments the way time.time() can.
start = time.perf_counter()
result = engine.replay_with_modifications(model="gpt-4o")
elapsed = time.perf_counter() - start

# Analyze retry impact
retry_stats = {
    "total_spans": len(result.spans),
    "spans_with_retries": sum(1 for s in result.spans if s.retry_count > 0),
    "total_retries": sum(s.retry_count for s in result.spans),
    "elapsed_time": elapsed,
}

print("\nRetry Statistics")
print("=" * 60)
print(f"Total Spans: {retry_stats['total_spans']}")
print(f"Spans with Retries: {retry_stats['spans_with_retries']}")
print(f"Total Retry Attempts: {retry_stats['total_retries']}")
print(f"Elapsed Time: {retry_stats['elapsed_time']:.2f}s")

# Calculate estimated retry overhead: measured wall time minus the
# duration the replay itself reports (converted from ms to seconds)
base_time = result.total_duration_ms / 1000
retry_overhead = elapsed - base_time
print(f"Estimated Retry Overhead: {retry_overhead:.2f}s")
Semantic Fallback Example¶
Compare accuracy with and without sentence-transformers.
Using Fallback (No Dependencies)¶
from prela.replay import ReplayEngine, compare_replays
from prela.replay.loader import TraceLoader
trace = TraceLoader.from_file("trace.jsonl")
engine = ReplayEngine(trace)

# Original execution
original = engine.replay_exact()

# Modified execution
modified = engine.replay_with_modifications(
    model="gpt-4o",
    temperature=0.7,
)

# Compare (will use fallback if sentence-transformers not installed)
comparison = compare_replays(original, modified)

# Check which method was used
if comparison.semantic_similarity_available:
    print(f"✅ Using embeddings: {comparison.semantic_similarity_model}")
else:
    print("⚠️ Using fallback: difflib + Jaccard")

# Show similarities
print("\nSemantic Similarities:")
# The scoring method is the same for every difference, so resolve it once
method = "embeddings" if comparison.semantic_similarity_available else "fallback"
for diff in comparison.differences:
    # "is not None" so that a 0.0 similarity is still listed
    if diff.field == "output" and diff.semantic_similarity is not None:
        print(f"{diff.span_name}: {diff.semantic_similarity:.1%} ({method})")
Accuracy Comparison¶
# Test fallback accuracy
test_pairs = [
    ("Hello World", "hello world"),  # Case change
    ("The quick brown fox", "the fast brown fox"),  # Word change
    ("cat dog bird", "dog bird cat"),  # Word reorder
    ("Python programming", "JavaScript programming"),  # Different topic
]

print("\nFallback Accuracy Test")
print("=" * 60)

from prela.replay.comparison import ReplayComparator

# Use fallback explicitly
comparator = ReplayComparator(use_semantic_similarity=False)

for text1, text2 in test_pairs:
    # NOTE: _compute_fallback_similarity is a private method (leading
    # underscore); it is called directly here for demonstration only.
    similarity = comparator._compute_fallback_similarity(text1, text2)
    print(f"\n'{text1}' vs '{text2}'")
    print(f" Similarity: {similarity:.2%}")

    if similarity > 0.9:
        status = "✅ Highly similar"
    elif similarity > 0.7:
        status = "⚠️ Moderately similar"
    else:
        status = "❌ Different"
    print(f" Status: {status}")
When to Install sentence-transformers¶
# Check if you need better accuracy
comparison = compare_replays(original, modified)

# Count scores in the ambiguous 60-80% band
low_confidence_count = sum(
    1 for diff in comparison.differences
    if diff.semantic_similarity and 0.6 < diff.semantic_similarity < 0.8
)

if comparison.semantic_similarity_available:
    # Embeddings are already in use, so the fallback advice does not apply
    print("✅ Embedding-based similarity is already in use")
elif low_confidence_count > 5:
    print("⚠️ Consider installing sentence-transformers for better accuracy")
    print(" pip install prela[similarity]")
    print(f" {low_confidence_count} similarities in ambiguous range (60-80%)")
else:
    print("✅ Fallback accuracy is sufficient for this use case")
CLI Workflow¶
Using the command-line interface:
1. Capture Trace with Replay Data¶
# Enable replay capture in your application
export PRELA_CAPTURE_FOR_REPLAY=true
# Run your application
python my_agent.py
# Traces saved to traces.jsonl with replay data
2. Exact Replay (Verify)¶
# Quick verification
prela replay traces.jsonl
# Output:
# Trace ID: abc-123
# Duration: 2.5s
# Tokens: 1,234
# Cost: $0.0185
3. Modified Replay (Experiment)¶
# Test with GPT-4o
prela replay traces.jsonl --model gpt-4o --compare
# Output:
# Original: gpt-4 ($0.0185)
# Modified: gpt-4o ($0.0092)
# Savings: $0.0093 (50.3%)
# Avg Similarity: 87.2%
4. Save Results¶
# Export comparison to JSON
prela replay traces.jsonl \
--model gpt-4o \
--temperature 0.7 \
--compare \
--output experiment_results.json
5. Batch Processing¶
# Process multiple traces
for trace in test_cases/*.jsonl; do
    # If nothing matches, the shell passes the unexpanded pattern through;
    # skip it instead of handing a non-existent path to prela.
    [ -e "$trace" ] || continue
    echo "Processing: $trace"
    prela replay "$trace" --model gpt-4o --compare
done
CI/CD Integration¶
Automate regression testing in CI:
# .github/workflows/regression_test.yml
name: Regression Test

on:
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install prela sentence-transformers

      - name: Run regression tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          python scripts/regression_test.py

      # always() uploads the results even when the test step failed.
      # upload-artifact v3 has been retired by GitHub; v4 is required.
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: regression-results
          path: regression_results/
scripts/regression_test.py:
#!/usr/bin/env python3
"""Regression test runner for CI."""
import glob
import json
import sys
from pathlib import Path
from prela.replay import ReplayEngine, compare_replays
from prela.replay.loader import TraceLoader
def main():
    """Replay every trace in ``test_traces/`` with gpt-4o and gate on similarity.

    For each ``*.jsonl`` trace the exact replay is compared with a gpt-4o
    replay; a trace passes when the average semantic similarity of its
    differences is at least 0.85. Results are written to
    ``regression_results/summary.json``.

    Exits with status 1 when any trace fails or when no traces are found.
    """
    # Sort so the run order and the summary file are stable between runs
    test_cases = sorted(glob.glob("test_traces/*.jsonl"))
    if not test_cases:
        # An empty run must not be reported as "all tests passed"
        print("No test traces found in test_traces/")
        sys.exit(1)

    results = []
    failed = []
    for test_file in test_cases:
        print(f"Testing: {test_file}")
        trace = TraceLoader.from_file(test_file)
        engine = ReplayEngine(trace)
        original = engine.replay_exact()
        modified = engine.replay_with_modifications(model="gpt-4o")
        comparison = compare_replays(original, modified)

        # Check similarity threshold. "is not None" keeps a real 0.0 score
        # in the average instead of silently dropping the worst results.
        similarities = [
            d.semantic_similarity
            for d in comparison.differences
            if d.semantic_similarity is not None
        ]
        avg_similarity = sum(similarities) / len(similarities) if similarities else 0
        passed = avg_similarity >= 0.85

        result = {
            "test": test_file,
            "passed": passed,
            "similarity": avg_similarity,
            "cost_delta": modified.total_cost_usd - original.total_cost_usd,
        }
        results.append(result)
        if not passed:
            failed.append(result)

    # Save results
    Path("regression_results").mkdir(exist_ok=True)
    with open("regression_results/summary.json", "w") as f:
        json.dump(results, f, indent=2)

    # Print summary
    print("\n" + "=" * 50)
    print(f"Passed: {len(results) - len(failed)}/{len(results)}")
    print(f"Failed: {len(failed)}")

    if failed:
        print("\nFailures:")
        for failure in failed:
            print(f" - {failure['test']} ({failure['similarity']:.1%})")
        sys.exit(1)

    print("\n✓ All regression tests passed!")


if __name__ == "__main__":
    main()
Next Steps¶
- Replay Concepts: Understand replay fundamentals
- CLI Reference: Complete CLI documentation
- API Reference: Detailed API documentation