"""Production evaluation harness - illustrative fragment.Shows the pattern for LLM-as-judge with calibration."""from dataclasses import dataclassfrom enum import Enumimport anthropicfrom typing import Optionalimport jsonimport hashlibimport timeimport asyncioclass Rating(Enum): EXCELLENT =5 GOOD =4 ACCEPTABLE =3 POOR =2 FAILED =1@dataclassclass EvalResult: query_id: str rating: Rating reasoning: str latency_ms: float token_count: int cost_usd: float retrieval_recall: Optional[float] =None# Calibrated judge prompt with explicit rubric {.unnumbered}JUDGE_PROMPT ="""You are evaluating a RAG system response. Be strict and consistent.## Query{query}## Retrieved Context{context}## System Response {response}## Ground Truth (if available){ground_truth}## Evaluation RubricRate the response on this scale:5 - EXCELLENT: Fully answers the query using retrieved context. All claims grounded. No hallucinations. Appropriate caveats where needed.4 - GOOD: Answers the query well. Minor issues (slight verbosity, missing minor detail). All major claims grounded in context.3 - ACCEPTABLE: Partially answers query. Some relevant information but incomplete. No hallucinations but may miss key points from context.2 - POOR: Fails to adequately answer. May include information not in context. Significant gaps or minor hallucinations.1 - FAILED: Does not answer query, major hallucinations, or harmful content. Contradicts retrieved context.## Instructions1. First, identify what the query is asking for2. Check if the response addresses this3. Verify each claim against the retrieved context4. Check for any information NOT in the context (potential hallucination)5. 
Provide your rating and reasoningRespond in JSON format:{{"rating": <1-5>, "reasoning": "<detailed explanation>"}}"""class RAGEvaluator:def__init__(self, judge_model: str="claude-3-5-sonnet-20241022"):self.client = anthropic.Anthropic()self.judge_model = judge_model# Calibration: run these periodically to check judge consistencyself.calibration_set =self._load_calibration_examples()asyncdef evaluate_single(self, query: str, context: list[str], response: str, ground_truth: Optional[str] =None ) -> EvalResult:"""Evaluate a single RAG response.""" prompt = JUDGE_PROMPT.format( query=query, context="\n---\n".join(context), response=response, ground_truth=ground_truth or"Not available" ) start_time = time.time() result =awaitself.client.messages.create( model=self.judge_model, max_tokens=500, messages=[{"role": "user", "content": prompt}] ) latency_ms = (time.time() - start_time) *1000# Parse judge responsetry: judgment = json.loads(result.content[0].text) rating = Rating(judgment["rating"]) reasoning = judgment["reasoning"]except (json.JSONDecodeError, KeyError, ValueError): rating = Rating.FAILED reasoning =f"Judge parse error: {result.content[0].text}"return EvalResult( query_id=hashlib.md5(query.encode()).hexdigest()[:8], rating=rating, reasoning=reasoning, latency_ms=latency_ms, token_count=result.usage.input_tokens + result.usage.output_tokens, cost_usd=self._calculate_cost(result.usage) )asyncdef run_evaluation_suite(self, test_cases: list[dict], parallel: int=10 ) ->'EvalReport':"""Run full evaluation suite with parallel execution.""" semaphore = asyncio.Semaphore(parallel)asyncdef bounded_eval(case):asyncwith semaphore:returnawaitself.evaluate_single(**case) results =await asyncio.gather(*[ bounded_eval(case) for case in test_cases ])return EvalReport( results=results, summary=self._compute_summary(results), timestamp=datetime.utcnow() )def _compute_summary(self, results: list[EvalResult]) ->dict:"""Compute aggregate metrics.""" ratings = [r.rating.value for r 
in results]return {"total_evaluated": len(results),"mean_rating": sum(ratings) /len(ratings),"rating_distribution": { r.name: sum(1for x in results if x.rating == r)for r in Rating },"pass_rate": sum(1for r in results if r.rating.value >=3) /len(results),"mean_latency_ms": sum(r.latency_ms for r in results) /len(results),"total_cost_usd": sum(r.cost_usd for r in results) }def _calculate_cost(self, usage) ->float:"""Calculate cost based on Claude 3.5 Sonnet pricing.""" input_cost = (usage.input_tokens /1_000_000) *3 output_cost = (usage.output_tokens /1_000_000) *15return input_cost + output_costdef _load_calibration_examples(self) ->list[dict]:"""Load calibration set for judge consistency checks."""# In production: load from file or databasereturn []
async def main() -> None:
    """Judge one example response, then a small batch, and print the headline numbers.

    The calls are wrapped in a coroutine because ``await`` is a syntax error
    at module level in a regular Python script.
    """
    evaluator = RAGEvaluator()

    # Single evaluation
    result = await evaluator.evaluate_single(
        query="What is the refund policy?",
        context=["Refunds are available within 30 days of purchase..."],
        response="You can get a refund within 30 days.",
        ground_truth="30-day refund policy",
    )
    print(f"Rating: {result.rating.name}, Cost: ${result.cost_usd:.4f}")

    # Batch evaluation: each case supplies the keyword arguments of
    # evaluate_single (ground_truth is optional).
    test_cases = [
        {
            "query": "What is the refund policy?",
            "context": ["Refunds are available within 30 days of purchase..."],
            "response": "You can get a refund within 30 days.",
            "ground_truth": "30-day refund policy",
        },
        {
            "query": "How long does a refund take to process?",
            "context": ["Refunds are available within 30 days of purchase..."],
            "response": "Refunds are processed instantly.",
        },
    ]
    report = await evaluator.run_evaluation_suite(test_cases, parallel=20)
    print(f"Pass rate: {report.summary['pass_rate']:.1%}")


if __name__ == "__main__":
    asyncio.run(main())
---
number-sections: false
execute:
  enabled: false
---

# Evaluation Harness Pattern {.unnumbered}

## LLM-as-Judge Evaluator (Python)

```python
"""Production evaluation harness - illustrative fragment.

Shows the pattern for LLM-as-judge with calibration.
"""

from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
import anthropic
from typing import Optional
import json
import hashlib
import time
import asyncio


class Rating(Enum):
    """Five-point quality scale assigned by the judge (higher is better)."""

    EXCELLENT = 5
    GOOD = 4
    ACCEPTABLE = 3
    POOR = 2
    FAILED = 1


@dataclass
class EvalResult:
    """Outcome of judging a single RAG response."""

    query_id: str  # first 8 hex chars of the MD5 of the query text
    rating: Rating
    reasoning: str  # judge's explanation, or the raw judge output on parse failure
    latency_ms: float  # wall-clock duration of the judge API call
    token_count: int  # judge input tokens + output tokens
    cost_usd: float
    retrieval_recall: Optional[float] = None  # never populated by RAGEvaluator itself


@dataclass
class EvalReport:
    """Aggregate output of ``RAGEvaluator.run_evaluation_suite``."""

    results: list[EvalResult]
    summary: dict  # see RAGEvaluator._compute_summary for the keys
    timestamp: datetime  # timezone-aware UTC time at which the suite finished


# Calibrated judge prompt with explicit rubric.
# Literal braces in the JSON example are doubled because the template is
# filled in with str.format().
JUDGE_PROMPT = """You are evaluating a RAG system response. Be strict and consistent.

## Query
{query}

## Retrieved Context
{context}

## System Response
{response}

## Ground Truth (if available)
{ground_truth}

## Evaluation Rubric

Rate the response on this scale:

5 - EXCELLENT: Fully answers the query using retrieved context. All claims grounded.
    No hallucinations. Appropriate caveats where needed.

4 - GOOD: Answers the query well. Minor issues (slight verbosity, missing minor detail).
    All major claims grounded in context.

3 - ACCEPTABLE: Partially answers query. Some relevant information but incomplete.
    No hallucinations but may miss key points from context.

2 - POOR: Fails to adequately answer. May include information not in context.
    Significant gaps or minor hallucinations.

1 - FAILED: Does not answer query, major hallucinations, or harmful content.
    Contradicts retrieved context.

## Instructions
1. First, identify what the query is asking for
2. Check if the response addresses this
3. Verify each claim against the retrieved context
4. Check for any information NOT in the context (potential hallucination)
5. Provide your rating and reasoning

Respond in JSON format:
{{"rating": <1-5>, "reasoning": "<detailed explanation>"}}
"""


class RAGEvaluator:
    """Scores RAG responses with an LLM judge and aggregates the results."""

    # USD per million tokens. These are the Claude 3.5 Sonnet rates and are
    # applied regardless of which ``judge_model`` is configured.
    INPUT_USD_PER_MTOK = 3
    OUTPUT_USD_PER_MTOK = 15

    def __init__(self, judge_model: str = "claude-3-5-sonnet-20241022"):
        """Create the judge client.

        Args:
            judge_model: Model identifier sent with every judge request.
        """
        # The evaluation methods await the client, so the async client is
        # required; awaiting the synchronous ``anthropic.Anthropic`` client
        # raises TypeError and would block the event loop.
        self.client = anthropic.AsyncAnthropic()
        self.judge_model = judge_model
        # Calibration: run these periodically to check judge consistency
        self.calibration_set = self._load_calibration_examples()

    async def evaluate_single(
        self,
        query: str,
        context: list[str],
        response: str,
        ground_truth: Optional[str] = None,
    ) -> EvalResult:
        """Evaluate a single RAG response.

        Args:
            query: The user question that was put to the RAG system.
            context: Retrieved passages; joined with ``---`` separators in the prompt.
            response: The RAG system's answer to be judged.
            ground_truth: Reference answer; "Not available" is sent when falsy.

        Returns:
            An ``EvalResult``. If the judge's reply cannot be parsed the rating
            is ``Rating.FAILED`` and ``reasoning`` carries the raw reply.
        """
        prompt = JUDGE_PROMPT.format(
            query=query,
            context="\n---\n".join(context),
            response=response,
            ground_truth=ground_truth or "Not available",
        )

        # perf_counter is monotonic, so the latency cannot go negative or jump
        # when the wall clock is adjusted mid-call.
        start_time = time.perf_counter()
        result = await self.client.messages.create(
            model=self.judge_model,
            max_tokens=500,
            messages=[{"role": "user", "content": prompt}],
        )
        latency_ms = (time.perf_counter() - start_time) * 1000

        # An empty content list is treated like any other unparseable reply.
        raw_text = result.content[0].text if result.content else ""
        rating, reasoning = self._parse_judgment(raw_text)

        return EvalResult(
            query_id=hashlib.md5(query.encode()).hexdigest()[:8],
            rating=rating,
            reasoning=reasoning,
            latency_ms=latency_ms,
            token_count=result.usage.input_tokens + result.usage.output_tokens,
            cost_usd=self._calculate_cost(result.usage),
        )

    @staticmethod
    def _parse_judgment(raw_text: str) -> tuple[Rating, str]:
        """Turn the judge's raw reply into ``(rating, reasoning)``.

        Tries the whole reply as JSON first, then the outermost ``{...}`` span,
        because judges frequently wrap the object in prose or a code fence.
        Falls back to ``Rating.FAILED`` with the raw reply as the reasoning.
        """
        candidates = [raw_text]
        start, end = raw_text.find("{"), raw_text.rfind("}")
        if 0 <= start < end:
            candidates.append(raw_text[start:end + 1])

        for candidate in candidates:
            try:
                judgment = json.loads(candidate)
                score = judgment["rating"]
                if isinstance(score, str):
                    # Accept a quoted number such as "4".
                    score = int(score.strip())
                return Rating(score), str(judgment["reasoning"])
            except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                # TypeError covers valid JSON that is not an object (e.g. `3`).
                continue

        # NOTE(review): a judge failure is indistinguishable from a genuinely
        # FAILED response in the aggregates, so parse errors lower pass_rate.
        return Rating.FAILED, f"Judge parse error: {raw_text}"

    async def run_evaluation_suite(
        self,
        test_cases: list[dict],
        parallel: int = 10,
    ) -> 'EvalReport':
        """Run full evaluation suite with parallel execution.

        Args:
            test_cases: Dicts whose keys are the keyword arguments of
                ``evaluate_single``.
            parallel: Maximum number of judge calls in flight at once.

        Returns:
            An ``EvalReport`` with per-case results in input order.

        Raises:
            ValueError: If ``parallel`` is less than 1.
        """
        if parallel < 1:
            # Semaphore(0) would never admit a task and the suite would hang.
            raise ValueError("parallel must be >= 1")

        semaphore = asyncio.Semaphore(parallel)

        async def bounded_eval(case):
            async with semaphore:
                return await self.evaluate_single(**case)

        # gather preserves input order; the first exception propagates.
        results = await asyncio.gather(
            *(bounded_eval(case) for case in test_cases)
        )

        return EvalReport(
            results=results,
            summary=self._compute_summary(results),
            timestamp=datetime.now(timezone.utc),
        )

    def _compute_summary(self, results: list[EvalResult]) -> dict:
        """Compute aggregate metrics.

        Returns a dict with ``total_evaluated``, ``mean_rating``,
        ``rating_distribution`` (count per ``Rating`` name), ``pass_rate``
        (share rated ACCEPTABLE or better), ``mean_latency_ms`` and
        ``total_cost_usd``. An empty ``results`` list yields zeros rather than
        raising ZeroDivisionError.
        """
        total = len(results)
        denominator = total or 1  # keeps the averages defined for an empty suite

        distribution = {rating.name: 0 for rating in Rating}
        for item in results:
            distribution[item.rating.name] += 1

        passing = sum(
            1 for item in results
            if item.rating.value >= Rating.ACCEPTABLE.value
        )

        return {
            "total_evaluated": total,
            "mean_rating": sum(item.rating.value for item in results) / denominator,
            "rating_distribution": distribution,
            "pass_rate": passing / denominator,
            "mean_latency_ms": sum(item.latency_ms for item in results) / denominator,
            "total_cost_usd": sum((item.cost_usd for item in results), 0.0),
        }

    def _calculate_cost(self, usage) -> float:
        """Calculate cost based on Claude 3.5 Sonnet pricing.

        Args:
            usage: Object exposing ``input_tokens`` and ``output_tokens``.

        Returns:
            Cost in USD for the call.
        """
        input_cost = (usage.input_tokens / 1_000_000) * self.INPUT_USD_PER_MTOK
        output_cost = (usage.output_tokens / 1_000_000) * self.OUTPUT_USD_PER_MTOK
        return input_cost + output_cost

    def _load_calibration_examples(self) -> list[dict]:
        """Load calibration set for judge consistency checks."""
        # In production: load from file or database
        return []
```

## Calibration Set Structure

```python
CALIBRATION_SET = [
    # Known good examples (should score 4-5)
    {"id": "cal_good_1", "expected_range": (4, 5), "category": "simple_qa"},
    {"id": "cal_good_2", "expected_range": (4, 5), "category": "complex_reasoning"},
    # Known bad examples (should score 1-2)
    {"id": "cal_bad_1", "expected_range": (1, 2), "category": "hallucination"},
    {"id": "cal_bad_2", "expected_range": (1, 2), "category": "off_topic"},
    # Known medium examples (should score 2.5-3.5)
    # NOTE(review): the judge emits integer ratings, so a single run can only
    # land in this range with a 3; the fractional bounds presumably apply to a
    # mean over repeated judge runs - confirm.
    {"id": "cal_med_1", "expected_range": (2.5, 3.5), "category": "partial_answer"},
    {"id": "cal_med_2", "expected_range": (2.5, 3.5), "category": "minor_errors"},
]
```

## Usage Example

```python
import asyncio


async def main() -> None:
    """Judge one example response, then a small batch, and print the headline numbers."""
    evaluator = RAGEvaluator()

    # Single evaluation
    result = await evaluator.evaluate_single(
        query="What is the refund policy?",
        context=["Refunds are available within 30 days of purchase..."],
        response="You can get a refund within 30 days.",
        ground_truth="30-day refund policy",
    )
    print(f"Rating: {result.rating.name}, Cost: ${result.cost_usd:.4f}")

    # Batch evaluation: each case supplies the keyword arguments of
    # evaluate_single (ground_truth is optional).
    test_cases = [
        {
            "query": "What is the refund policy?",
            "context": ["Refunds are available within 30 days of purchase..."],
            "response": "You can get a refund within 30 days.",
            "ground_truth": "30-day refund policy",
        },
        {
            "query": "How long does a refund take to process?",
            "context": ["Refunds are available within 30 days of purchase..."],
            "response": "Refunds are processed instantly.",
        },
    ]
    report = await evaluator.run_evaluation_suite(test_cases, parallel=20)
    print(f"Pass rate: {report.summary['pass_rate']:.1%}")


if __name__ == "__main__":
    asyncio.run(main())
```