Evaluation Harness Pattern

LLM-as-Judge Evaluator (Python)

"""
Production evaluation harness - illustrative fragment.
Shows the pattern for LLM-as-judge with calibration.
"""
import asyncio
import hashlib
import json
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from typing import Optional

import anthropic

class Rating(Enum):
    """Five-point quality scale a judge assigns to a RAG response.

    The integer values mirror the 1-5 scale described in the rubric of
    ``JUDGE_PROMPT``; a higher value means a better response. Ratings of
    ACCEPTABLE (3) or above are counted as passing by
    ``RAGEvaluator._compute_summary``.
    """

    EXCELLENT = 5   # fully answers the query, all claims grounded
    GOOD = 4        # answers well, minor issues only
    ACCEPTABLE = 3  # partial answer, no hallucinations
    POOR = 2        # inadequate answer, significant gaps or minor hallucinations
    FAILED = 1      # no answer, major hallucinations, or harmful content

@dataclass
class EvalResult:
    """Outcome of judging a single RAG response."""

    # First 8 hex characters of the MD5 digest of the query text.
    query_id: str
    # Rating given by the judge; Rating.FAILED is also used when the
    # judge's output cannot be parsed.
    rating: Rating
    # The judge's explanation, or a "Judge parse error: ..." message.
    reasoning: str
    # Elapsed time of the judge API call, in milliseconds.
    latency_ms: float
    # Input tokens plus output tokens reported for the judge call.
    token_count: int
    # Estimated cost of the judge call in US dollars.
    cost_usd: float
    # Not set by RAGEvaluator.evaluate_single, so it stays None there.
    retrieval_recall: Optional[float] = None

# Calibrated judge prompt with an explicit 1-5 rubric.
# The template is filled in with str.format(): {query}, {context},
# {response} and {ground_truth} are substitution fields, and the doubled
# braces around the JSON example at the end are escapes that render as
# literal "{" and "}".
JUDGE_PROMPT = """You are evaluating a RAG system response. Be strict and consistent.

## Query
{query}

## Retrieved Context
{context}

## System Response  
{response}

## Ground Truth (if available)
{ground_truth}

## Evaluation Rubric

Rate the response on this scale:

5 - EXCELLENT: Fully answers the query using retrieved context. All claims grounded. 
    No hallucinations. Appropriate caveats where needed.

4 - GOOD: Answers the query well. Minor issues (slight verbosity, missing minor detail).
    All major claims grounded in context.

3 - ACCEPTABLE: Partially answers query. Some relevant information but incomplete.
    No hallucinations but may miss key points from context.

2 - POOR: Fails to adequately answer. May include information not in context.
    Significant gaps or minor hallucinations.

1 - FAILED: Does not answer query, major hallucinations, or harmful content.
    Contradicts retrieved context.

## Instructions
1. First, identify what the query is asking for
2. Check if the response addresses this
3. Verify each claim against the retrieved context
4. Check for any information NOT in the context (potential hallucination)
5. Provide your rating and reasoning

Respond in JSON format:
{{"rating": <1-5>, "reasoning": "<detailed explanation>"}}
"""

class RAGEvaluator:
    """LLM-as-judge evaluator for RAG responses.

    Each (query, context, response) triple is rendered into
    ``JUDGE_PROMPT``, sent to the judge model, and the judge's JSON
    verdict is converted into an ``EvalResult``.
    """

    # USD per million tokens used by _calculate_cost (Claude 3.5 Sonnet
    # pricing). Override in a subclass when judging with another model.
    INPUT_USD_PER_MTOK = 3.0
    OUTPUT_USD_PER_MTOK = 15.0

    def __init__(self, judge_model: str = "claude-3-5-sonnet-20241022"):
        """Create the API client and load the calibration set.

        Args:
            judge_model: Model identifier sent with every judge request.
        """
        # evaluate_single awaits the client, so it must be the async
        # variant; the synchronous client returns a plain response object
        # that cannot be awaited.
        self.client = anthropic.AsyncAnthropic()
        self.judge_model = judge_model

        # Calibration: run these periodically to check judge consistency
        self.calibration_set = self._load_calibration_examples()

    async def evaluate_single(
        self,
        query: str,
        context: list[str],
        response: str,
        ground_truth: Optional[str] = None
    ) -> EvalResult:
        """Evaluate a single RAG response.

        Args:
            query: The user query that was sent to the RAG system.
            context: Retrieved passages; joined with ``---`` separators
                before being placed in the prompt.
            response: The RAG system's answer to judge.
            ground_truth: Optional reference answer; "Not available" is
                shown to the judge when it is missing or empty.

        Returns:
            An ``EvalResult``. When the judge's reply cannot be parsed the
            rating is ``Rating.FAILED`` and the reasoning starts with
            "Judge parse error:" followed by the raw reply.
        """
        prompt = JUDGE_PROMPT.format(
            query=query,
            context="\n---\n".join(context),
            response=response,
            ground_truth=ground_truth or "Not available"
        )

        # perf_counter is monotonic, so a system clock adjustment during
        # the request cannot distort the measured latency.
        start_time = time.perf_counter()

        result = await self.client.messages.create(
            model=self.judge_model,
            max_tokens=500,
            messages=[{"role": "user", "content": prompt}]
        )

        latency_ms = (time.perf_counter() - start_time) * 1000

        # Parse judge response. Reading the text happens inside the try
        # block so that an empty or non-text reply is reported as a parse
        # error instead of raising out of the evaluation.
        raw_text = ""
        try:
            raw_text = result.content[0].text
            rating_value, reasoning = self._extract_judgment(raw_text)
            rating = Rating(rating_value)
        except (KeyError, ValueError, TypeError, IndexError, AttributeError):
            rating = Rating.FAILED
            reasoning = f"Judge parse error: {raw_text}"

        return EvalResult(
            # MD5 is only a short, stable identifier here, not a security
            # measure; flagging that keeps it usable on FIPS-restricted
            # Python builds. The digest itself is unchanged.
            query_id=hashlib.md5(
                query.encode(), usedforsecurity=False
            ).hexdigest()[:8],
            rating=rating,
            reasoning=reasoning,
            latency_ms=latency_ms,
            token_count=result.usage.input_tokens + result.usage.output_tokens,
            cost_usd=self._calculate_cost(result.usage)
        )

    @staticmethod
    def _extract_judgment(raw_text: str) -> tuple[int, str]:
        """Pull the rating and reasoning out of the judge's reply.

        The first JSON object in the text is decoded and anything after
        it is ignored, so Markdown code fences or prose around the object
        do not cause a parse failure.

        Args:
            raw_text: The judge's reply text.

        Returns:
            ``(rating, reasoning)`` with the rating as an ``int``. The
            rating is not range-checked here.

        Raises:
            ValueError: No JSON object is present, the JSON is malformed,
                or the rating is not a whole number.
            KeyError: The object lacks "rating" or "reasoning".
            TypeError: The rating has a type that cannot be an integer.
        """
        start = raw_text.find("{")
        if start == -1:
            raise ValueError("no JSON object in judge output")
        payload, _ = json.JSONDecoder().raw_decode(raw_text[start:])

        rating_value = payload["rating"]
        # int() would silently accept True/False and truncate 3.5 to 3.
        if isinstance(rating_value, bool):
            raise ValueError("rating must be a number, not a boolean")
        if isinstance(rating_value, float) and not rating_value.is_integer():
            raise ValueError(f"rating must be a whole number: {rating_value!r}")
        return int(rating_value), str(payload["reasoning"])

    async def run_evaluation_suite(
        self,
        test_cases: list[dict],
        parallel: int = 10
    ) -> 'EvalReport':
        """Run full evaluation suite with parallel execution.

        Args:
            test_cases: One dict per case; each dict is expanded into the
                keyword arguments of ``evaluate_single``.
            parallel: Maximum number of judge calls in flight at once.

        Returns:
            An ``EvalReport`` built from the per-case results, their
            summary, and a timezone-aware UTC timestamp.

        Raises:
            ValueError: ``parallel`` is smaller than 1.
        """
        # NOTE(review): EvalReport is not defined in the visible part of
        # this file; this assumes it accepts results/summary/timestamp
        # keyword arguments - confirm where it is declared.
        if parallel < 1:
            # A semaphore created with 0 permits would never let a case
            # start, so the suite would wait forever.
            raise ValueError("parallel must be at least 1")

        semaphore = asyncio.Semaphore(parallel)

        async def bounded_eval(case):
            async with semaphore:
                return await self.evaluate_single(**case)

        results = await asyncio.gather(
            *(bounded_eval(case) for case in test_cases)
        )

        return EvalReport(
            results=results,
            summary=self._compute_summary(results),
            timestamp=datetime.now(timezone.utc)
        )

    def _compute_summary(self, results: list[EvalResult]) -> dict:
        """Compute aggregate metrics.

        Args:
            results: Per-case evaluation results; may be empty.

        Returns:
            A dict with the keys ``total_evaluated``, ``mean_rating``,
            ``rating_distribution``, ``pass_rate``, ``mean_latency_ms``
            and ``total_cost_usd``. For an empty input every average and
            rate is 0.0 and every distribution count is 0.
        """
        total = len(results)

        distribution = {rating.name: 0 for rating in Rating}
        passed = 0
        for item in results:
            distribution[item.rating.name] += 1
            # ACCEPTABLE (3) and above count as a pass.
            if item.rating.value >= 3:
                passed += 1

        # Dividing by 1 for an empty run turns the averages into 0.0
        # instead of raising ZeroDivisionError.
        divisor = total or 1

        return {
            "total_evaluated": total,
            "mean_rating": sum(r.rating.value for r in results) / divisor,
            "rating_distribution": distribution,
            "pass_rate": passed / divisor,
            "mean_latency_ms": sum(r.latency_ms for r in results) / divisor,
            "total_cost_usd": sum(r.cost_usd for r in results)
        }

    def _calculate_cost(self, usage) -> float:
        """Calculate cost based on Claude 3.5 Sonnet pricing.

        Args:
            usage: Object exposing ``input_tokens`` and ``output_tokens``.

        Returns:
            Estimated cost in US dollars, using the per-million-token
            rates stored on the class.
        """
        input_cost = (usage.input_tokens / 1_000_000) * self.INPUT_USD_PER_MTOK
        output_cost = (usage.output_tokens / 1_000_000) * self.OUTPUT_USD_PER_MTOK
        return input_cost + output_cost

    def _load_calibration_examples(self) -> list[dict]:
        """Load calibration set for judge consistency checks.

        Returns:
            Currently always an empty list (placeholder).
        """
        # In production: load from file or database
        return []

Calibration Set Structure

# Calibration examples with the score range the judge is expected to give
# each one. Every row below is (id, expected_range, category) and is
# expanded into a dict with exactly those three keys.
CALIBRATION_SET = [
    {"id": example_id, "expected_range": score_range, "category": category}
    for example_id, score_range, category in (
        # Known good examples (should score 4-5)
        ("cal_good_1", (4, 5), "simple_qa"),
        ("cal_good_2", (4, 5), "complex_reasoning"),
        # Known bad examples (should score 1-2)
        ("cal_bad_1", (1, 2), "hallucination"),
        ("cal_bad_2", (1, 2), "off_topic"),
        # Known medium examples (should score 2.5-3.5)
        ("cal_med_1", (2.5, 3.5), "partial_answer"),
        ("cal_med_2", (2.5, 3.5), "minor_errors"),
    )
]

Usage Example

async def main() -> None:
    """Run one single-response evaluation and one small batch evaluation.

    ``await`` is only valid inside a coroutine in a regular Python module,
    so the example lives in this function. In a notebook or another
    already-running event loop, call ``await main()`` instead of using
    ``asyncio.run``.
    """
    evaluator = RAGEvaluator()

    # Single evaluation
    result = await evaluator.evaluate_single(
        query="What is the refund policy?",
        context=["Refunds are available within 30 days of purchase..."],
        response="You can get a refund within 30 days.",
        ground_truth="30-day refund policy"
    )

    print(f"Rating: {result.rating.name}, Cost: ${result.cost_usd:.4f}")

    # Batch evaluation. Each case is expanded into keyword arguments of
    # evaluate_single, so its keys must match that method's parameters.
    test_cases = [
        {
            "query": "What is the refund policy?",
            "context": ["Refunds are available within 30 days of purchase..."],
            "response": "You can get a refund within 30 days.",
            "ground_truth": "30-day refund policy",
        },
    ]
    report = await evaluator.run_evaluation_suite(test_cases, parallel=20)
    print(f"Pass rate: {report.summary['pass_rate']:.1%}")


if __name__ == "__main__":
    asyncio.run(main())

--- number-sections: false execute: enabled: false --- # Evaluation Harness Pattern {.unnumbered} ## LLM-as-Judge Evaluator (Python) ```python """ Production evaluation harness - illustrative fragment. Shows the pattern for LLM-as-judge with calibration. """ from dataclasses import dataclass from enum import Enum import anthropic from typing import Optional import json import hashlib import time import asyncio class Rating(Enum): EXCELLENT = 5 GOOD = 4 ACCEPTABLE = 3 POOR = 2 FAILED = 1 @dataclass class EvalResult: query_id: str rating: Rating reasoning: str latency_ms: float token_count: int cost_usd: float retrieval_recall: Optional[float] = None # Calibrated judge prompt with explicit rubric {.unnumbered} JUDGE_PROMPT = """You are evaluating a RAG system response. Be strict and consistent. ## Query {query} ## Retrieved Context {context} ## System Response {response} ## Ground Truth (if available) {ground_truth} ## Evaluation Rubric Rate the response on this scale: 5 - EXCELLENT: Fully answers the query using retrieved context. All claims grounded. No hallucinations. Appropriate caveats where needed. 4 - GOOD: Answers the query well. Minor issues (slight verbosity, missing minor detail). All major claims grounded in context. 3 - ACCEPTABLE: Partially answers query. Some relevant information but incomplete. No hallucinations but may miss key points from context. 2 - POOR: Fails to adequately answer. May include information not in context. Significant gaps or minor hallucinations. 1 - FAILED: Does not answer query, major hallucinations, or harmful content. Contradicts retrieved context. ## Instructions 1. First, identify what the query is asking for 2. Check if the response addresses this 3. Verify each claim against the retrieved context 4. Check for any information NOT in the context (potential hallucination) 5. 
Provide your rating and reasoning Respond in JSON format: {{"rating": <1-5>, "reasoning": "<detailed explanation>"}} """ class RAGEvaluator: def __init__(self, judge_model: str = "claude-3-5-sonnet-20241022"): self.client = anthropic.Anthropic() self.judge_model = judge_model # Calibration: run these periodically to check judge consistency self.calibration_set = self._load_calibration_examples() async def evaluate_single( self, query: str, context: list[str], response: str, ground_truth: Optional[str] = None ) -> EvalResult: """Evaluate a single RAG response.""" prompt = JUDGE_PROMPT.format( query=query, context="\n---\n".join(context), response=response, ground_truth=ground_truth or "Not available" ) start_time = time.time() result = await self.client.messages.create( model=self.judge_model, max_tokens=500, messages=[{"role": "user", "content": prompt}] ) latency_ms = (time.time() - start_time) * 1000 # Parse judge response try: judgment = json.loads(result.content[0].text) rating = Rating(judgment["rating"]) reasoning = judgment["reasoning"] except (json.JSONDecodeError, KeyError, ValueError): rating = Rating.FAILED reasoning = f"Judge parse error: {result.content[0].text}" return EvalResult( query_id=hashlib.md5(query.encode()).hexdigest()[:8], rating=rating, reasoning=reasoning, latency_ms=latency_ms, token_count=result.usage.input_tokens + result.usage.output_tokens, cost_usd=self._calculate_cost(result.usage) ) async def run_evaluation_suite( self, test_cases: list[dict], parallel: int = 10 ) -> 'EvalReport': """Run full evaluation suite with parallel execution.""" semaphore = asyncio.Semaphore(parallel) async def bounded_eval(case): async with semaphore: return await self.evaluate_single(**case) results = await asyncio.gather(*[ bounded_eval(case) for case in test_cases ]) return EvalReport( results=results, summary=self._compute_summary(results), timestamp=datetime.utcnow() ) def _compute_summary(self, results: list[EvalResult]) -> dict: """Compute 
aggregate metrics.""" ratings = [r.rating.value for r in results] return { "total_evaluated": len(results), "mean_rating": sum(ratings) / len(ratings), "rating_distribution": { r.name: sum(1 for x in results if x.rating == r) for r in Rating }, "pass_rate": sum(1 for r in results if r.rating.value >= 3) / len(results), "mean_latency_ms": sum(r.latency_ms for r in results) / len(results), "total_cost_usd": sum(r.cost_usd for r in results) } def _calculate_cost(self, usage) -> float: """Calculate cost based on Claude 3.5 Sonnet pricing.""" input_cost = (usage.input_tokens / 1_000_000) * 3 output_cost = (usage.output_tokens / 1_000_000) * 15 return input_cost + output_cost def _load_calibration_examples(self) -> list[dict]: """Load calibration set for judge consistency checks.""" # In production: load from file or database return [] ``` ## Calibration Set Structure ```python CALIBRATION_SET = [ # Known good examples (should score 4-5) {"id": "cal_good_1", "expected_range": (4, 5), "category": "simple_qa"}, {"id": "cal_good_2", "expected_range": (4, 5), "category": "complex_reasoning"}, # Known bad examples (should score 1-2) {"id": "cal_bad_1", "expected_range": (1, 2), "category": "hallucination"}, {"id": "cal_bad_2", "expected_range": (1, 2), "category": "off_topic"}, # Known medium examples (should score 2.5-3.5) {"id": "cal_med_1", "expected_range": (2.5, 3.5), "category": "partial_answer"}, {"id": "cal_med_2", "expected_range": (2.5, 3.5), "category": "minor_errors"}, ] ``` ## Usage Example ```python evaluator = RAGEvaluator() # Single evaluation {.unnumbered} result = await evaluator.evaluate_single( query="What is the refund policy?", context=["Refunds are available within 30 days of purchase..."], response="You can get a refund within 30 days.", ground_truth="30-day refund policy" ) print(f"Rating: {result.rating.name}, Cost: ${result.cost_usd:.4f}") # Batch evaluation {.unnumbered} report = await evaluator.run_evaluation_suite(test_cases, parallel=20) 
print(f"Pass rate: {report.summary['pass_rate']:.1%}") ```