- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants - Rewrite runner with _run_step1, _run_step2, _run_full dispatch - CLI: replace --variants with --mode flag - Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full) - Remove old freelance_invoices fixture - Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full) - Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode - Langfuse: post separate classification_precision/recall/f1 scores for full mode - Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1) - Langfuse: include step1_results in trace output - MockExecutor: mock async_session to bypass DB in full mode - Journey fixture: remove user_messages (only interactive test kept)
269 lines
8.9 KiB
Python
269 lines
8.9 KiB
Python
"""Scoring functions for batch agent evaluation.
|
|
|
|
Two scoring strategies:

1. **FieldMatchScorer** (``score_field_match``) — deterministic check: for
   each expected record, find the best-matching actual record and compare
   the specified fields. Feeds precision, recall, and per-field accuracy.

2. **LLMJudgeScorer** (``llm_judge_score``) — uses a secondary LLM to
   semantically evaluate whether the actual extractions satisfy the expected
   intent, even if wording differs. Returns a 0-1 score + reasoning.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from difflib import SequenceMatcher
|
|
from typing import Any
|
|
|
|
from langchain_core.messages import HumanMessage, SystemMessage
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ── Result types ─────────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class FieldScore:
    """Outcome of comparing one expected record with the actual record chosen for it."""

    # The expected record exactly as supplied by the caller.
    expected: dict[str, Any]
    # The actual record paired with ``expected``; None when nothing was paired.
    best_match: dict[str, Any] | None
    # Per-field verdicts, keyed by field name (empty when there is no match).
    matched_fields: dict[str, bool]
    # Similarity of the pairing, on a 0-1 scale.
    similarity: float

    @property
    def field_accuracy(self) -> float:
        """Fraction of compared fields that matched; 0.0 if no field was compared."""
        compared = len(self.matched_fields)
        return sum(self.matched_fields.values()) / compared if compared else 0.0
|
|
|
|
|
|
@dataclass
class EvalScores:
    """Scores collected for a single evaluation run (one fixture / model / variant)."""

    fixture_name: str
    model: str
    prompt_variant: str
    field_scores: list[FieldScore] = field(default_factory=list)
    precision: float = 0.0
    recall: float = 0.0
    f1: float = 0.0
    llm_judge_score: float | None = None
    llm_judge_reasoning: str = ""
    # Records the run produced that no expected record accounts for.
    extra_records: int = 0
    # Expected records for which no actual record was found.
    missing_records: int = 0

    @property
    def field_accuracy(self) -> float:
        """Mean of the per-record field accuracies; 0.0 when there are no field scores."""
        count = len(self.field_scores)
        if count == 0:
            return 0.0
        return sum(score.field_accuracy for score in self.field_scores) / count

    def summary(self) -> dict[str, Any]:
        """Return a flat dict of the headline numbers, floats rounded to 3 decimals."""
        judge = self.llm_judge_score
        report: dict[str, Any] = {
            "fixture": self.fixture_name,
            "model": self.model,
            "prompt_variant": self.prompt_variant,
        }
        for metric in ("precision", "recall", "f1", "field_accuracy"):
            report[metric] = round(getattr(self, metric), 3)
        # The judge score is optional: keep None as None instead of rounding it.
        report["llm_judge_score"] = None if judge is None else round(judge, 3)
        report["extra_records"] = self.extra_records
        report["missing_records"] = self.missing_records
        return report
|
|
|
|
|
|
# ── Field Match Scorer ───────────────────────────────────────────────────
|
|
|
|
|
|
def _normalize(value: Any) -> str:
|
|
"""Normalize a value for comparison."""
|
|
if value is None:
|
|
return ""
|
|
return str(value).strip().lower()
|
|
|
|
|
|
def _text_similarity(a: str, b: str) -> float:
|
|
"""Fuzzy text similarity using SequenceMatcher."""
|
|
if not a and not b:
|
|
return 1.0
|
|
if not a or not b:
|
|
return 0.0
|
|
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
|
|
|
|
|
|
def _find_best_match(
|
|
expected: dict[str, Any],
|
|
actuals: list[dict[str, Any]],
|
|
) -> tuple[dict[str, Any] | None, float]:
|
|
"""Find the actual record most similar to expected, return (match, similarity)."""
|
|
if not actuals:
|
|
return None, 0.0
|
|
|
|
best_match = None
|
|
best_score = 0.0
|
|
|
|
# Primary matching key: title or name
|
|
expected_title = _normalize(expected.get("title", expected.get("name", "")))
|
|
|
|
for actual in actuals:
|
|
actual_title = _normalize(actual.get("title", actual.get("name", "")))
|
|
sim = _text_similarity(expected_title, actual_title)
|
|
if sim > best_score:
|
|
best_score = sim
|
|
best_match = actual
|
|
|
|
return best_match, best_score
|
|
|
|
|
|
def _compare_fields(
|
|
expected: dict[str, Any],
|
|
actual: dict[str, Any],
|
|
) -> dict[str, bool]:
|
|
"""Compare each expected field against the actual record."""
|
|
results: dict[str, bool] = {}
|
|
for key, expected_val in expected.items():
|
|
actual_val = actual.get(key)
|
|
# Exact match for non-string types
|
|
if not isinstance(expected_val, str):
|
|
results[key] = actual_val == expected_val
|
|
else:
|
|
# Fuzzy match for strings (threshold: 0.7)
|
|
results[key] = _text_similarity(
|
|
_normalize(expected_val), _normalize(actual_val)
|
|
) >= 0.7
|
|
return results
|
|
|
|
|
|
def score_field_match(
    expected_records: list[dict[str, Any]],
    actual_records: list[dict[str, Any]],
    table: str,
) -> tuple[list[FieldScore], int, int]:
    """Pair expected records with actual records and score each pair.

    Matching is greedy and follows the order of ``expected_records``: each
    expected record claims the not-yet-claimed actual record with the highest
    title/name similarity, provided that similarity is at least 0.5. An actual
    record is claimed at most once. ``table`` is accepted but not used here.

    Returns ``(field_scores, extra_count, missing_count)`` where
    ``field_scores`` has one entry per expected record, ``extra_count`` is the
    number of actual records nobody claimed, and ``missing_count`` is the
    number of expected records left without a match.
    """
    claimed: set[int] = set()
    scores: list[FieldScore] = []

    for wanted in expected_records:
        top_pos: int | None = None
        top_sim = 0.0
        for pos, candidate in enumerate(actual_records):
            if pos in claimed:
                continue
            sim = _find_best_match(wanted, [candidate])[1]
            # Strict comparison: the earliest record wins a tie.
            if sim > top_sim:
                top_pos, top_sim = pos, sim

        if top_pos is None or top_sim < 0.5:
            # Nothing left to claim, or nothing similar enough.
            scores.append(FieldScore(
                expected=wanted, best_match=None, matched_fields={}, similarity=0.0,
            ))
            continue

        claimed.add(top_pos)
        hit = actual_records[top_pos]
        verdicts = _compare_fields(wanted, hit)
        scores.append(FieldScore(
            expected=wanted, best_match=hit,
            matched_fields=verdicts, similarity=top_sim,
        ))

    unclaimed = len(actual_records) - len(claimed)
    unmatched = len([entry for entry in scores if entry.best_match is None])
    return scores, unclaimed, unmatched
|
|
|
|
|
|
def compute_precision_recall(
    expected_count: int,
    actual_count: int,
    matched_count: int,
) -> tuple[float, float, float]:
    """Return ``(precision, recall, f1)`` for a matching result.

    Precision is matched/actual and recall is matched/expected; either is 0.0
    when its denominator is not positive. F1 is their harmonic mean, or 0.0
    when both are zero.
    """

    def _share(total: int) -> float:
        # Guard against division by zero for empty sides.
        return matched_count / total if total > 0 else 0.0

    precision = _share(actual_count)
    recall = _share(expected_count)
    combined = precision + recall
    if combined > 0:
        f1 = 2 * precision * recall / combined
    else:
        f1 = 0.0
    return precision, recall, f1
|
|
|
|
|
|
# ── LLM Judge Scorer ─────────────────────────────────────────────────────
|
|
|
|
_JUDGE_SYSTEM_PROMPT = """\
|
|
You are an evaluation judge for a data extraction system.
|
|
|
|
Your task is to compare the EXPECTED extractions against the ACTUAL extractions
|
|
produced by an AI agent, and assess quality on a 0-1 scale.
|
|
|
|
Scoring criteria:
|
|
- 1.0: All expected records found with correct fields, no significant extras
|
|
- 0.8: Most expected records found, minor field differences or extras
|
|
- 0.6: Core extractions present but some missing or incorrect
|
|
- 0.4: Partial match — several expected records missing or wrong
|
|
- 0.2: Poor quality — most expected records missing or incorrect
|
|
- 0.0: Complete failure — no meaningful overlap
|
|
|
|
Consider semantic equivalence: "Send invoice" and "Email the invoice" are matches.
|
|
Ignore field ordering and formatting differences.
|
|
|
|
Respond with ONLY a JSON object:
|
|
{"score": 0.85, "reasoning": "Brief explanation of the score"}
|
|
"""
|
|
|
|
|
|
def _parse_judge_response(content: Any) -> tuple[float, str]:
    """Turn the judge model's raw reply into ``(score, reasoning)``.

    Accepts plain-string content or a list of content blocks (strings or
    dicts with a "text" entry). The JSON object is taken as the span from the
    first ``{`` to the last ``}``, so code fences or prose around it are
    ignored. The score is clamped to [0, 1].

    Raises:
        ValueError: no JSON object is present, the JSON is invalid, or the
            score is NaN.
        TypeError: the "score" value cannot be converted to float.
    """
    if isinstance(content, list):
        # Message content may arrive as a list of blocks instead of one string.
        pieces: list[str] = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict):
                pieces.append(str(part.get("text", "")))
        content = "".join(pieces)

    raw = str(content).strip()
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object in judge response")

    parsed = json.loads(raw[start:end + 1])
    score = float(parsed.get("score", 0.0))
    if score != score:  # NaN is the only float that differs from itself
        raise ValueError("judge returned a NaN score")
    if not 0.0 <= score <= 1.0:
        logger.warning("eval: LLM judge score %s outside [0, 1]; clamping", score)
        score = min(1.0, max(0.0, score))
    return score, str(parsed.get("reasoning", ""))


async def llm_judge_score(
    expected: list[dict[str, Any]],
    actual: list[dict[str, Any]],
    *,
    judge_model: str = "gpt-4o-mini",
) -> tuple[float, str]:
    """Use an LLM to semantically evaluate extraction quality.

    Both record lists are serialized as JSON (non-JSON values via ``str``)
    and sent to the judge model together with the judge system prompt.

    Args:
        expected: Records the run was expected to produce.
        actual: Records the run actually produced.
        judge_model: Model name passed to ``get_llm``.

    Returns:
        ``(score, reasoning)`` with the score in [0, 1]. This is best-effort:
        if the model call or the parsing of its reply fails, the failure is
        logged and ``(0.0, "Judge error: ...")`` is returned instead of raising.
    """
    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)

    user_content = (
        f"## Expected extractions\n```json\n{json.dumps(expected, indent=2, default=str)}\n```\n\n"
        f"## Actual extractions\n```json\n{json.dumps(actual, indent=2, default=str)}\n```"
    )

    try:
        response = await llm.ainvoke([
            SystemMessage(content=_JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_content),
        ])
        return _parse_judge_response(response.content)
    except Exception as exc:
        # Deliberate catch-all: a judge failure must not abort the eval run.
        logger.warning("eval: LLM judge failed: %s", exc)
        return 0.0, f"Judge error: {exc}"
|