Files
api/services/batch-agent/eval/scorer.py
Roberto Musso d3f7099d93 refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes
- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00

269 lines
8.9 KiB
Python

"""Scoring functions for batch agent evaluation.

Two scoring strategies:

1. **FieldMatchScorer** (``score_field_match``) — deterministic check: for
   each expected record, find the best-matching actual record and compare
   the specified fields. It yields per-record field scores plus extra and
   missing counts; precision, recall, and F1 are derived from the counts
   with ``compute_precision_recall``.
2. **LLMJudgeScorer** (``llm_judge_score``) — uses a secondary LLM to
   semantically evaluate whether the actual extractions satisfy the
   expected intent, even if wording differs. Returns a 0-1 score +
   reasoning.
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Any
from langchain_core.messages import HumanMessage, SystemMessage
logger = logging.getLogger(__name__)
# ── Result types ─────────────────────────────────────────────────────────
@dataclass
class FieldScore:
    """Result of scoring one expected record against its best-matching actual record."""

    # The expected record this score refers to.
    expected: dict[str, Any]
    # The actual record paired with ``expected``, or ``None`` when none was paired.
    best_match: dict[str, Any] | None
    # Per-field verdicts keyed by field name (True means the field matched).
    matched_fields: dict[str, bool]
    similarity: float  # 0-1 overall similarity

    @property
    def field_accuracy(self) -> float:
        """Share of compared fields that matched; 0.0 when no field was compared."""
        verdicts = self.matched_fields
        return sum(verdicts.values()) / len(verdicts) if verdicts else 0.0
@dataclass
class EvalScores:
    """Aggregate scores collected for a single evaluation run."""

    # Identification of the run.
    fixture_name: str
    model: str
    prompt_variant: str
    # Per-expected-record scores; their mean drives ``field_accuracy``.
    field_scores: list[FieldScore] = field(default_factory=list)
    precision: float = 0.0
    recall: float = 0.0
    f1: float = 0.0
    # ``None`` means no LLM judge score was recorded.
    llm_judge_score: float | None = None
    llm_judge_reasoning: str = ""
    extra_records: int = 0  # records created but not expected
    missing_records: int = 0  # expected but not found

    @property
    def field_accuracy(self) -> float:
        """Mean ``field_accuracy`` across ``field_scores``; 0.0 when the list is empty."""
        scores = self.field_scores
        if scores:
            return sum(score.field_accuracy for score in scores) / len(scores)
        return 0.0

    def summary(self) -> dict[str, Any]:
        """Return a flat dict of the headline metrics, floats rounded to 3 decimals.

        ``llm_judge_score`` is passed through as ``None`` when it was never set.
        """
        report: dict[str, Any] = {
            "fixture": self.fixture_name,
            "model": self.model,
            "prompt_variant": self.prompt_variant,
        }
        # Insertion order below mirrors the order consumers see in the dict.
        for metric in ("precision", "recall", "f1", "field_accuracy"):
            report[metric] = round(getattr(self, metric), 3)
        judge = self.llm_judge_score
        report["llm_judge_score"] = None if judge is None else round(judge, 3)
        report["extra_records"] = self.extra_records
        report["missing_records"] = self.missing_records
        return report
# ── Field Match Scorer ───────────────────────────────────────────────────
def _normalize(value: Any) -> str:
"""Normalize a value for comparison."""
if value is None:
return ""
return str(value).strip().lower()
def _text_similarity(a: str, b: str) -> float:
"""Fuzzy text similarity using SequenceMatcher."""
if not a and not b:
return 1.0
if not a or not b:
return 0.0
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
def _find_best_match(
expected: dict[str, Any],
actuals: list[dict[str, Any]],
) -> tuple[dict[str, Any] | None, float]:
"""Find the actual record most similar to expected, return (match, similarity)."""
if not actuals:
return None, 0.0
best_match = None
best_score = 0.0
# Primary matching key: title or name
expected_title = _normalize(expected.get("title", expected.get("name", "")))
for actual in actuals:
actual_title = _normalize(actual.get("title", actual.get("name", "")))
sim = _text_similarity(expected_title, actual_title)
if sim > best_score:
best_score = sim
best_match = actual
return best_match, best_score
def _compare_fields(
expected: dict[str, Any],
actual: dict[str, Any],
) -> dict[str, bool]:
"""Compare each expected field against the actual record."""
results: dict[str, bool] = {}
for key, expected_val in expected.items():
actual_val = actual.get(key)
# Exact match for non-string types
if not isinstance(expected_val, str):
results[key] = actual_val == expected_val
else:
# Fuzzy match for strings (threshold: 0.7)
results[key] = _text_similarity(
_normalize(expected_val), _normalize(actual_val)
) >= 0.7
return results
def score_field_match(
    expected_records: list[dict[str, Any]],
    actual_records: list[dict[str, Any]],
    table: str,
) -> tuple[list[FieldScore], int, int]:
    """Score actual extractions against the expected records of one table.

    Expected records are processed in order; each one claims the not-yet-
    claimed actual record with the highest title/name similarity, provided
    that similarity is at least 0.5. A claimed actual record is never paired
    with a later expected record.

    Args:
        expected_records: Records that should have been extracted.
        actual_records: Records that were actually extracted.
        table: Table name; accepted for the caller's benefit but not read here.

    Returns:
        ``(field_scores, extra_count, missing_count)`` where ``field_scores``
        has one entry per expected record, ``extra_count`` is the number of
        actual records left unclaimed, and ``missing_count`` is the number of
        expected records that found no partner.
    """

    def _unmatched(record: dict[str, Any]) -> FieldScore:
        return FieldScore(
            expected=record, best_match=None, matched_fields={}, similarity=0.0,
        )

    results: list[FieldScore] = []
    claimed: set[int] = set()

    for wanted in expected_records:
        top_idx, top_record, top_sim = None, None, 0.0
        for position, candidate in enumerate(actual_records):
            if position in claimed:
                continue
            _, sim = _find_best_match(wanted, [candidate])
            # Strict comparison: the earliest candidate wins on equal similarity.
            if sim > top_sim:
                top_idx, top_record, top_sim = position, candidate, sim

        if top_record is not None and top_sim >= 0.5:
            claimed.add(top_idx)
            results.append(FieldScore(
                expected=wanted,
                best_match=top_record,
                matched_fields=_compare_fields(wanted, top_record),
                similarity=top_sim,
            ))
        else:
            # Covers both "no candidates left" and "best candidate too dissimilar".
            results.append(_unmatched(wanted))

    extras = len(actual_records) - len(claimed)
    missing = sum(1 for entry in results if entry.best_match is None)
    return results, extras, missing
def compute_precision_recall(
    expected_count: int,
    actual_count: int,
    matched_count: int,
) -> tuple[float, float, float]:
    """Derive precision, recall, and F1 from record counts.

    Args:
        expected_count: Number of expected records (recall denominator).
        actual_count: Number of actual records (precision denominator).
        matched_count: Number of records counted as matched.

    Returns:
        ``(precision, recall, f1)``. Any ratio whose denominator is not
        positive is reported as 0.0 instead of raising.
    """
    precision = 0.0
    if actual_count > 0:
        precision = matched_count / actual_count

    recall = 0.0
    if expected_count > 0:
        recall = matched_count / expected_count

    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1
# ── LLM Judge Scorer ─────────────────────────────────────────────────────
# System prompt sent to the judge model by ``llm_judge_score``. It defines the
# 0-1 scoring rubric and asks for a bare JSON object with "score" and
# "reasoning" keys, which is the shape ``llm_judge_score`` parses. This text is
# runtime input to the model: editing it changes judge behaviour.
_JUDGE_SYSTEM_PROMPT = """\
You are an evaluation judge for a data extraction system.
Your task is to compare the EXPECTED extractions against the ACTUAL extractions
produced by an AI agent, and assess quality on a 0-1 scale.
Scoring criteria:
- 1.0: All expected records found with correct fields, no significant extras
- 0.8: Most expected records found, minor field differences or extras
- 0.6: Core extractions present but some missing or incorrect
- 0.4: Partial match — several expected records missing or wrong
- 0.2: Poor quality — most expected records missing or incorrect
- 0.0: Complete failure — no meaningful overlap
Consider semantic equivalence: "Send invoice" and "Email the invoice" are matches.
Ignore field ordering and formatting differences.
Respond with ONLY a JSON object:
{"score": 0.85, "reasoning": "Brief explanation of the score"}
"""
def _parse_judge_response(raw: str) -> tuple[float, str]:
    """Extract ``(score, reasoning)`` from the judge model's raw text reply.

    Accepts a bare JSON object, one wrapped in a Markdown code fence
    (optionally tagged ``json``), or one surrounded by stray prose. The score
    is clamped into the 0-1 range the judge prompt asks for.

    Raises:
        Whatever ``json.loads`` / ``float`` / attribute access raise when the
        reply is not a JSON object with a numeric ``"score"``; the caller
        treats any exception as a judge failure.
    """
    text = raw.strip()
    # Drop a leading code fence: keep only what sits between the first pair.
    if text.startswith("```"):
        text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]
    # Tolerate prose before/after the object by slicing to the outermost braces.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]

    parsed = json.loads(text.strip())
    score = float(parsed.get("score", 0.0))
    if not 0.0 <= score <= 1.0:
        logger.warning("eval: LLM judge score %s outside [0, 1]; clamping", score)
        # max() runs first so that NaN collapses to 0.0 rather than 1.0.
        score = min(1.0, max(0.0, score))
    return score, str(parsed.get("reasoning", ""))


async def llm_judge_score(
    expected: list[dict[str, Any]],
    actual: list[dict[str, Any]],
    *,
    judge_model: str = "gpt-4o-mini",
) -> tuple[float, str]:
    """Use an LLM to semantically evaluate extraction quality.

    Both record lists are serialized to JSON (non-serializable values are
    stringified) and sent to the judge model together with
    ``_JUDGE_SYSTEM_PROMPT``.

    Args:
        expected: Records that should have been extracted.
        actual: Records that were actually extracted.
        judge_model: Model name handed to ``get_llm``.

    Returns:
        ``(score, reasoning)`` with ``score`` in ``[0.0, 1.0]``. This is
        best-effort: if the model call or the parsing of its reply fails, the
        failure is logged and ``(0.0, "Judge error: ...")`` is returned
        instead of raising.
    """
    # NOTE(review): imported lazily in the original; presumably so the module
    # loads without the project's LLM stack — confirm before hoisting.
    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)
    user_content = (
        f"## Expected extractions\n```json\n{json.dumps(expected, indent=2, default=str)}\n```\n\n"
        f"## Actual extractions\n```json\n{json.dumps(actual, indent=2, default=str)}\n```"
    )
    try:
        response = await llm.ainvoke([
            SystemMessage(content=_JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_content),
        ])
        return _parse_judge_response(response.content)
    except Exception as exc:
        # Deliberately broad: a broken judge must not abort the eval run.
        logger.warning("eval: LLM judge failed: %s", exc)
        return 0.0, f"Judge error: {exc}"