feat(batch-agent): add E2E evaluation harness with Langfuse integration
- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete)
- eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides
- eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval)
- eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs
- eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting
- eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags
- eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
This commit is contained in:
268
services/batch-agent/eval/scorer.py
Normal file
268
services/batch-agent/eval/scorer.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""Scoring functions for batch agent evaluation.
|
||||
|
||||
Two scoring strategies:
|
||||
|
||||
1. **FieldMatchScorer** (`score_field_match`) — deterministic check: for each expected record,
|
||||
find the best-matching actual record and compare specified fields.
|
||||
Returns precision, recall, and per-field accuracy.
|
||||
|
||||
2. **LLMJudgeScorer** (`llm_judge_score`) — uses a secondary LLM to semantically evaluate
|
||||
whether the actual extractions satisfy the expected intent, even if
|
||||
wording differs. Returns a 0-1 score + reasoning.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from difflib import SequenceMatcher
|
||||
from typing import Any
|
||||
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── Result types ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass
class FieldScore:
    """Outcome of comparing one expected record with the actual record paired to it.

    ``matched_fields`` maps each compared field name to whether it matched;
    it is empty when nothing was compared.
    """

    expected: dict[str, Any]
    best_match: dict[str, Any] | None
    matched_fields: dict[str, bool]
    similarity: float  # overall similarity, 0-1

    @property
    def field_accuracy(self) -> float:
        """Fraction of compared fields that matched (0.0 when none were compared)."""
        outcomes = self.matched_fields
        return sum(outcomes.values()) / len(outcomes) if outcomes else 0.0
|
||||
|
||||
|
||||
@dataclass
class EvalScores:
    """Combined metrics for a single fixture / model / prompt-variant run."""

    fixture_name: str
    model: str
    prompt_variant: str
    field_scores: list[FieldScore] = field(default_factory=list)
    precision: float = 0.0
    recall: float = 0.0
    f1: float = 0.0
    llm_judge_score: float | None = None
    llm_judge_reasoning: str = ""
    extra_records: int = 0  # records created but not expected
    missing_records: int = 0  # expected but not found

    @property
    def field_accuracy(self) -> float:
        """Mean of the per-record field accuracies (0.0 when there are none)."""
        scores = self.field_scores
        if scores:
            return sum(s.field_accuracy for s in scores) / len(scores)
        return 0.0

    def summary(self) -> dict[str, Any]:
        """Return a flat dict of the headline metrics, ratios rounded to 3 decimals."""
        report: dict[str, Any] = {
            "fixture": self.fixture_name,
            "model": self.model,
            "prompt_variant": self.prompt_variant,
        }
        # Ratio metrics share the same rounding; counts are reported as-is.
        for metric in ("precision", "recall", "f1", "field_accuracy"):
            report[metric] = round(getattr(self, metric), 3)
        judge = self.llm_judge_score
        report["llm_judge_score"] = None if judge is None else round(judge, 3)
        report["extra_records"] = self.extra_records
        report["missing_records"] = self.missing_records
        return report
|
||||
|
||||
|
||||
# ── Field Match Scorer ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _normalize(value: Any) -> str:
|
||||
"""Normalize a value for comparison."""
|
||||
if value is None:
|
||||
return ""
|
||||
return str(value).strip().lower()
|
||||
|
||||
|
||||
def _text_similarity(a: str, b: str) -> float:
|
||||
"""Fuzzy text similarity using SequenceMatcher."""
|
||||
if not a and not b:
|
||||
return 1.0
|
||||
if not a or not b:
|
||||
return 0.0
|
||||
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
|
||||
|
||||
|
||||
def _find_best_match(
    expected: dict[str, Any],
    actuals: list[dict[str, Any]],
) -> tuple[dict[str, Any] | None, float]:
    """Pick the actual record whose title/name is closest to *expected*'s.

    Records are compared on their ``"title"`` key, falling back to ``"name"``
    when ``"title"`` is absent. Returns ``(record, similarity)``; the result
    is ``(None, 0.0)`` when *actuals* is empty or no candidate scores above
    zero. On ties the earliest candidate wins.
    """
    if not actuals:
        return None, 0.0

    def _label(record: dict[str, Any]) -> str:
        # Primary matching key: title, or name when there is no title key.
        return _normalize(record.get("title", record.get("name", "")))

    target = _label(expected)
    sims = [_text_similarity(target, _label(candidate)) for candidate in actuals]

    # max() returns the first maximal index, matching a strict ">" scan.
    top = max(range(len(sims)), key=sims.__getitem__)
    if sims[top] <= 0.0:
        return None, 0.0
    return actuals[top], sims[top]
|
||||
|
||||
|
||||
def _compare_fields(
    expected: dict[str, Any],
    actual: dict[str, Any],
) -> dict[str, bool]:
    """Check every field of *expected* against the same key in *actual*.

    String expectations are compared fuzzily (normalized similarity of at
    least 0.7); every other type must compare equal. Keys missing from
    *actual* are treated as ``None``. Returns ``{field_name: matched}``.
    """
    verdicts: dict[str, bool] = {}
    for name, want in expected.items():
        got = actual.get(name)
        if isinstance(want, str):
            # Fuzzy match for strings (threshold: 0.7)
            score = _text_similarity(_normalize(want), _normalize(got))
            verdicts[name] = score >= 0.7
            continue
        # Exact match for non-string types
        verdicts[name] = got == want
    return verdicts
|
||||
|
||||
|
||||
def score_field_match(
    expected_records: list[dict[str, Any]],
    actual_records: list[dict[str, Any]],
    table: str,
    *,
    min_similarity: float = 0.5,
) -> tuple[list[FieldScore], int, int]:
    """Score actual extractions against expected records for one table.

    Each expected record is paired with at most one actual record (and vice
    versa). Pairing is done globally: all (expected, actual) pairs are ranked
    by title/name similarity and assigned best-first, so the result does not
    depend on the order of ``expected_records`` and an early, weakly matching
    expected record cannot take the actual record that a later expected
    record matches better.

    Args:
        expected_records: Records the agent should have produced.
        actual_records: Records the agent actually produced.
        table: Name of the table being scored. Currently unused by the
            scoring itself; kept for interface compatibility.
        min_similarity: Minimum title/name similarity (0-1) for two records
            to be considered the same entity. Defaults to 0.5.

    Returns:
        ``(field_scores, extra_count, missing_count)`` where ``field_scores``
        has one entry per expected record (in input order), ``extra_count``
        is the number of actual records left unpaired, and ``missing_count``
        is the number of expected records left unpaired.
    """
    # Score every candidate pair once; keep only pairs that clear the bar.
    ranked_pairs: list[tuple[float, int, int]] = []
    for exp_idx, exp in enumerate(expected_records):
        for act_idx, actual in enumerate(actual_records):
            match, sim = _find_best_match(exp, [actual])
            if match is not None and sim >= min_similarity:
                ranked_pairs.append((sim, exp_idx, act_idx))

    # Highest similarity first; ties resolve in input order for determinism.
    ranked_pairs.sort(key=lambda pair: (-pair[0], pair[1], pair[2]))

    assignment: dict[int, tuple[int, float]] = {}
    matched_actuals: set[int] = set()
    for sim, exp_idx, act_idx in ranked_pairs:
        if exp_idx in assignment or act_idx in matched_actuals:
            continue
        assignment[exp_idx] = (act_idx, sim)
        matched_actuals.add(act_idx)

    field_scores: list[FieldScore] = []
    for exp_idx, exp in enumerate(expected_records):
        paired = assignment.get(exp_idx)
        if paired is None:
            # No actual record was similar enough: counts as missing.
            field_scores.append(FieldScore(
                expected=exp, best_match=None, matched_fields={}, similarity=0.0,
            ))
            continue
        act_idx, sim = paired
        best_match = actual_records[act_idx]
        field_scores.append(FieldScore(
            expected=exp,
            best_match=best_match,
            matched_fields=_compare_fields(exp, best_match),
            similarity=sim,
        ))

    extra_count = len(actual_records) - len(matched_actuals)
    missing_count = len(expected_records) - len(assignment)

    return field_scores, extra_count, missing_count
|
||||
|
||||
|
||||
def compute_precision_recall(
    expected_count: int,
    actual_count: int,
    matched_count: int,
) -> tuple[float, float, float]:
    """Return ``(precision, recall, f1)`` for the given record counts.

    Precision is ``matched / actual`` and recall is ``matched / expected``;
    any ratio with a non-positive denominator is reported as 0.0, as is F1
    when precision and recall are both zero.
    """
    precision = 0.0
    if actual_count > 0:
        precision = matched_count / actual_count

    recall = 0.0
    if expected_count > 0:
        recall = matched_count / expected_count

    # Harmonic mean of precision and recall.
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return precision, recall, f1
|
||||
|
||||
|
||||
# ── LLM Judge Scorer ─────────────────────────────────────────────────────
|
||||
|
||||
_JUDGE_SYSTEM_PROMPT = """\
|
||||
You are an evaluation judge for a data extraction system.
|
||||
|
||||
Your task is to compare the EXPECTED extractions against the ACTUAL extractions
|
||||
produced by an AI agent, and assess quality on a 0-1 scale.
|
||||
|
||||
Scoring criteria:
|
||||
- 1.0: All expected records found with correct fields, no significant extras
|
||||
- 0.8: Most expected records found, minor field differences or extras
|
||||
- 0.6: Core extractions present but some missing or incorrect
|
||||
- 0.4: Partial match — several expected records missing or wrong
|
||||
- 0.2: Poor quality — most expected records missing or incorrect
|
||||
- 0.0: Complete failure — no meaningful overlap
|
||||
|
||||
Consider semantic equivalence: "Send invoice" and "Email the invoice" are matches.
|
||||
Ignore field ordering and formatting differences.
|
||||
|
||||
Respond with ONLY a JSON object:
|
||||
{"score": 0.85, "reasoning": "Brief explanation of the score"}
|
||||
"""
|
||||
|
||||
|
||||
async def llm_judge_score(
    expected: list[dict[str, Any]],
    actual: list[dict[str, Any]],
    *,
    judge_model: str = "gpt-4o-mini",
) -> tuple[float, str]:
    """Use an LLM to semantically evaluate extraction quality.

    Sends the expected and actual records (as JSON) to the judge model along
    with ``_JUDGE_SYSTEM_PROMPT`` and parses the JSON verdict from the reply.

    Args:
        expected: Records the agent should have produced.
        actual: Records the agent actually produced.
        judge_model: Model identifier passed to ``get_llm``.

    Returns:
        ``(score, reasoning)``. ``score`` is always within ``[0.0, 1.0]``.
        If the call or the parsing fails, the failure is logged and
        ``(0.0, "Judge error: ...")`` is returned instead of raising.
    """
    from app.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)

    user_content = (
        f"## Expected extractions\n```json\n{json.dumps(expected, indent=2, default=str)}\n```\n\n"
        f"## Actual extractions\n```json\n{json.dumps(actual, indent=2, default=str)}\n```"
    )

    try:
        response = await llm.ainvoke([
            SystemMessage(content=_JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_content),
        ])
        return _parse_judge_reply(_judge_reply_text(response.content))
    except Exception as exc:
        # Deliberate best-effort boundary: provider/network/parse errors must
        # not abort the eval run, so they degrade to a zero score with the
        # error recorded in the reasoning.
        logger.warning("eval: LLM judge failed: %s", exc)
        return 0.0, f"Judge error: {exc}"


def _judge_reply_text(content: Any) -> str:
    """Flatten a chat-message ``content`` value into plain text.

    Message content may be a plain string or a list of parts (strings or
    dicts carrying a ``"text"`` entry); non-text parts are ignored.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts: list[str] = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return "".join(parts)
    return str(content)


def _parse_judge_reply(raw: str) -> tuple[float, str]:
    """Extract ``(score, reasoning)`` from the judge's textual reply.

    Tolerates Markdown code fences and surrounding prose by parsing the span
    from the first ``{`` to the last ``}``. The score is clamped to
    ``[0.0, 1.0]``.

    Raises:
        ValueError: If no JSON object is present, the JSON is not an object,
            or the score is not a finite number.
    """
    import math

    text = raw.strip()
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in judge reply")

    parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("judge reply is not a JSON object")

    score = float(parsed.get("score", 0.0))
    if not math.isfinite(score):
        raise ValueError(f"judge returned a non-finite score: {score!r}")

    # The rubric is defined on a 0-1 scale; keep out-of-range replies from
    # skewing aggregated metrics.
    score = min(1.0, max(0.0, score))
    return score, str(parsed.get("reasoning", ""))
|
||||
Reference in New Issue
Block a user