"""Scoring functions for batch agent evaluation. Two scoring strategies: 1. **FieldMatchScorer** — deterministic check: for each expected record, find the best-matching actual record and compare specified fields. Returns precision, recall, and per-field accuracy. 2. **LLMJudgeScorer** — uses a secondary LLM to semantically evaluate whether the actual extractions satisfy the expected intent, even if wording differs. Returns a 0-1 score + reasoning. """ from __future__ import annotations import json import logging from dataclasses import dataclass, field from difflib import SequenceMatcher from typing import Any from langchain_core.messages import HumanMessage, SystemMessage logger = logging.getLogger(__name__) # ── Result types ───────────────────────────────────────────────────────── @dataclass class FieldScore: """Score for a single expected record against its best match.""" expected: dict[str, Any] best_match: dict[str, Any] | None matched_fields: dict[str, bool] similarity: float # 0-1 overall similarity @property def field_accuracy(self) -> float: if not self.matched_fields: return 0.0 return sum(self.matched_fields.values()) / len(self.matched_fields) @dataclass class EvalScores: """Aggregated scores for one eval run.""" fixture_name: str model: str prompt_variant: str field_scores: list[FieldScore] = field(default_factory=list) precision: float = 0.0 recall: float = 0.0 f1: float = 0.0 llm_judge_score: float | None = None llm_judge_reasoning: str = "" extra_records: int = 0 # records created but not expected missing_records: int = 0 # expected but not found @property def field_accuracy(self) -> float: if not self.field_scores: return 0.0 return sum(s.field_accuracy for s in self.field_scores) / len(self.field_scores) def summary(self) -> dict[str, Any]: return { "fixture": self.fixture_name, "model": self.model, "prompt_variant": self.prompt_variant, "precision": round(self.precision, 3), "recall": round(self.recall, 3), "f1": round(self.f1, 3), "field_accuracy": round(self.field_accuracy, 3), "llm_judge_score": round(self.llm_judge_score, 3) if self.llm_judge_score is not None else None, "extra_records": self.extra_records, "missing_records": self.missing_records, } # ── Field Match Scorer ─────────────────────────────────────────────────── def _normalize(value: Any) -> str: """Normalize a value for comparison.""" if value is None: return "" return str(value).strip().lower() def _text_similarity(a: str, b: str) -> float: """Fuzzy text similarity using SequenceMatcher.""" if not a and not b: return 1.0 if not a or not b: return 0.0 return SequenceMatcher(None, a.lower(), b.lower()).ratio() def _find_best_match( expected: dict[str, Any], actuals: list[dict[str, Any]], ) -> tuple[dict[str, Any] | None, float]: """Find the actual record most similar to expected, return (match, similarity).""" if not actuals: return None, 0.0 best_match = None best_score = 0.0 # Primary matching key: title or name expected_title = _normalize(expected.get("title", expected.get("name", ""))) for actual in actuals: actual_title = _normalize(actual.get("title", actual.get("name", ""))) sim = _text_similarity(expected_title, actual_title) if sim > best_score: best_score = sim best_match = actual return best_match, best_score def _compare_fields( expected: dict[str, Any], actual: dict[str, Any], ) -> dict[str, bool]: """Compare each expected field against the actual record.""" results: dict[str, bool] = {} for key, expected_val in expected.items(): actual_val = actual.get(key) # Exact match for non-string types if not isinstance(expected_val, str): results[key] = actual_val == expected_val else: # Fuzzy match for strings (threshold: 0.7) results[key] = _text_similarity( _normalize(expected_val), _normalize(actual_val) ) >= 0.7 return results def score_field_match( expected_records: list[dict[str, Any]], actual_records: list[dict[str, Any]], table: str, ) -> tuple[list[FieldScore], int, int]: """Score actual extractions against expected records for one table. Returns (field_scores, extra_count, missing_count). """ field_scores: list[FieldScore] = [] matched_actuals: set[int] = set() for exp in expected_records: # Find best match among unmatched actuals candidates = [ (i, a) for i, a in enumerate(actual_records) if i not in matched_actuals ] if not candidates: field_scores.append(FieldScore( expected=exp, best_match=None, matched_fields={}, similarity=0.0, )) continue best_idx, best_match = None, None best_sim = 0.0 for idx, actual in candidates: _, sim = _find_best_match(exp, [actual]) if sim > best_sim: best_sim = sim best_idx = idx best_match = actual if best_sim >= 0.5 and best_match is not None: matched_actuals.add(best_idx) matched_fields = _compare_fields(exp, best_match) field_scores.append(FieldScore( expected=exp, best_match=best_match, matched_fields=matched_fields, similarity=best_sim, )) else: field_scores.append(FieldScore( expected=exp, best_match=None, matched_fields={}, similarity=0.0, )) extra_count = len(actual_records) - len(matched_actuals) missing_count = sum(1 for s in field_scores if s.best_match is None) return field_scores, extra_count, missing_count def compute_precision_recall( expected_count: int, actual_count: int, matched_count: int, ) -> tuple[float, float, float]: """Compute precision, recall, F1.""" precision = matched_count / actual_count if actual_count > 0 else 0.0 recall = matched_count / expected_count if expected_count > 0 else 0.0 f1 = ( 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 ) return precision, recall, f1 # ── LLM Judge Scorer ───────────────────────────────────────────────────── _JUDGE_SYSTEM_PROMPT = """\ You are an evaluation judge for a data extraction system. Your task is to compare the EXPECTED extractions against the ACTUAL extractions produced by an AI agent, and assess quality on a 0-1 scale. Scoring criteria: - 1.0: All expected records found with correct fields, no significant extras - 0.8: Most expected records found, minor field differences or extras - 0.6: Core extractions present but some missing or incorrect - 0.4: Partial match — several expected records missing or wrong - 0.2: Poor quality — most expected records missing or incorrect - 0.0: Complete failure — no meaningful overlap Consider semantic equivalence: "Send invoice" and "Email the invoice" are matches. Ignore field ordering and formatting differences. Respond with ONLY a JSON object: {"score": 0.85, "reasoning": "Brief explanation of the score"} """ async def llm_judge_score( expected: list[dict[str, Any]], actual: list[dict[str, Any]], *, judge_model: str = "gpt-4o-mini", ) -> tuple[float, str]: """Use an LLM to semantically evaluate extraction quality. Returns (score, reasoning). """ from shared.llm import get_llm llm = get_llm(model=judge_model, temperature=0) user_content = ( f"## Expected extractions\n```json\n{json.dumps(expected, indent=2, default=str)}\n```\n\n" f"## Actual extractions\n```json\n{json.dumps(actual, indent=2, default=str)}\n```" ) try: response = await llm.ainvoke([ SystemMessage(content=_JUDGE_SYSTEM_PROMPT), HumanMessage(content=user_content), ]) raw = response.content.strip() if raw.startswith("```"): raw = raw.split("```")[1] if raw.startswith("json"): raw = raw[4:] parsed = json.loads(raw.strip()) return float(parsed.get("score", 0.0)), str(parsed.get("reasoning", "")) except Exception as exc: logger.warning("eval: LLM judge failed: %s", exc) return 0.0, f"Judge error: {exc}"