feat(batch-agent): add E2E evaluation harness with Langfuse integration

- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete)
- eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides
- eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval)
- eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs
- eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting
- eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags
- eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
2026-03-23 08:54:19 +01:00
parent 971f1dd84f
commit 75a826c9d8
12 changed files with 1382 additions and 0 deletions
--- a/services/batch-agent/eval/scorer.py
+++ b/services/batch-agent/eval/scorer.py
@@ -0,0 +1,268 @@
+"""Scoring functions for batch agent evaluation.
+
+Two scoring strategies:
+
+1. **Field match** (``score_field_match``) — deterministic check: for each
+   expected record, find the best-matching actual record (fuzzy title/name
+   match) and compare the expected fields.  Returns per-record field scores
+   plus extra/missing record counts; ``compute_precision_recall`` turns the
+   match counts into precision, recall, and F1.
+
+2. **LLM judge** (``llm_judge_score``) — uses a secondary LLM to semantically
+   evaluate whether the actual extractions satisfy the expected intent, even
+   if wording differs.  Returns a 0-1 score + reasoning.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass, field
+from difflib import SequenceMatcher
+from typing import Any
+
+from langchain_core.messages import HumanMessage, SystemMessage
+
+logger = logging.getLogger(__name__)
+
+
+# ── Result types ─────────────────────────────────────────────────────────
+
+
+@dataclass
+class FieldScore:
+    """Score for a single expected record against its best match."""
+
+    expected: dict[str, Any]
+    best_match: dict[str, Any] | None
+    matched_fields: dict[str, bool]
+    similarity: float  # 0-1 overall similarity
+
+    @property
+    def field_accuracy(self) -> float:
+        if not self.matched_fields:
+            return 0.0
+        return sum(self.matched_fields.values()) / len(self.matched_fields)
+
+
+@dataclass
+class EvalScores:
+    """Aggregated scores for one eval run."""
+
+    fixture_name: str
+    model: str
+    prompt_variant: str
+    field_scores: list[FieldScore] = field(default_factory=list)
+    precision: float = 0.0
+    recall: float = 0.0
+    f1: float = 0.0
+    llm_judge_score: float | None = None
+    llm_judge_reasoning: str = ""
+    extra_records: int = 0  # records created but not expected
+    missing_records: int = 0  # expected but not found
+
+    @property
+    def field_accuracy(self) -> float:
+        if not self.field_scores:
+            return 0.0
+        return sum(s.field_accuracy for s in self.field_scores) / len(self.field_scores)
+
+    def summary(self) -> dict[str, Any]:
+        return {
+            "fixture": self.fixture_name,
+            "model": self.model,
+            "prompt_variant": self.prompt_variant,
+            "precision": round(self.precision, 3),
+            "recall": round(self.recall, 3),
+            "f1": round(self.f1, 3),
+            "field_accuracy": round(self.field_accuracy, 3),
+            "llm_judge_score": round(self.llm_judge_score, 3) if self.llm_judge_score is not None else None,
+            "extra_records": self.extra_records,
+            "missing_records": self.missing_records,
+        }
+
+
+# ── Field Match Scorer ───────────────────────────────────────────────────
+
+
+def _normalize(value: Any) -> str:
+    """Normalize a value for comparison."""
+    if value is None:
+        return ""
+    return str(value).strip().lower()
+
+
+def _text_similarity(a: str, b: str) -> float:
+    """Fuzzy text similarity using SequenceMatcher."""
+    if not a and not b:
+        return 1.0
+    if not a or not b:
+        return 0.0
+    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
+
+
+def _find_best_match(
+    expected: dict[str, Any],
+    actuals: list[dict[str, Any]],
+) -> tuple[dict[str, Any] | None, float]:
+    """Find the actual record most similar to expected, return (match, similarity)."""
+    if not actuals:
+        return None, 0.0
+
+    best_match = None
+    best_score = 0.0
+
+    # Primary matching key: title or name
+    expected_title = _normalize(expected.get("title", expected.get("name", "")))
+
+    for actual in actuals:
+        actual_title = _normalize(actual.get("title", actual.get("name", "")))
+        sim = _text_similarity(expected_title, actual_title)
+        if sim > best_score:
+            best_score = sim
+            best_match = actual
+
+    return best_match, best_score
+
+
+def _compare_fields(
+    expected: dict[str, Any],
+    actual: dict[str, Any],
+) -> dict[str, bool]:
+    """Compare each expected field against the actual record."""
+    results: dict[str, bool] = {}
+    for key, expected_val in expected.items():
+        actual_val = actual.get(key)
+        # Exact match for non-string types
+        if not isinstance(expected_val, str):
+            results[key] = actual_val == expected_val
+        else:
+            # Fuzzy match for strings (threshold: 0.7)
+            results[key] = _text_similarity(
+                _normalize(expected_val), _normalize(actual_val)
+            ) >= 0.7
+    return results
+
+
+def score_field_match(
+    expected_records: list[dict[str, Any]],
+    actual_records: list[dict[str, Any]],
+    table: str,
+) -> tuple[list[FieldScore], int, int]:
+    """Score actual extractions against expected records for one table.
+
+    Returns (field_scores, extra_count, missing_count).
+    """
+    field_scores: list[FieldScore] = []
+    matched_actuals: set[int] = set()
+
+    for exp in expected_records:
+        # Find best match among unmatched actuals
+        candidates = [
+            (i, a) for i, a in enumerate(actual_records) if i not in matched_actuals
+        ]
+        if not candidates:
+            field_scores.append(FieldScore(
+                expected=exp, best_match=None, matched_fields={}, similarity=0.0,
+            ))
+            continue
+
+        best_idx, best_match = None, None
+        best_sim = 0.0
+        for idx, actual in candidates:
+            _, sim = _find_best_match(exp, [actual])
+            if sim > best_sim:
+                best_sim = sim
+                best_idx = idx
+                best_match = actual
+
+        if best_sim >= 0.5 and best_match is not None:
+            matched_actuals.add(best_idx)
+            matched_fields = _compare_fields(exp, best_match)
+            field_scores.append(FieldScore(
+                expected=exp, best_match=best_match,
+                matched_fields=matched_fields, similarity=best_sim,
+            ))
+        else:
+            field_scores.append(FieldScore(
+                expected=exp, best_match=None, matched_fields={}, similarity=0.0,
+            ))
+
+    extra_count = len(actual_records) - len(matched_actuals)
+    missing_count = sum(1 for s in field_scores if s.best_match is None)
+
+    return field_scores, extra_count, missing_count
+
+
+def compute_precision_recall(
+    expected_count: int,
+    actual_count: int,
+    matched_count: int,
+) -> tuple[float, float, float]:
+    """Compute precision, recall, F1."""
+    precision = matched_count / actual_count if actual_count > 0 else 0.0
+    recall = matched_count / expected_count if expected_count > 0 else 0.0
+    f1 = (
+        2 * precision * recall / (precision + recall)
+        if (precision + recall) > 0
+        else 0.0
+    )
+    return precision, recall, f1
+
+
+# ── LLM Judge Scorer ─────────────────────────────────────────────────────
+
+# System prompt sent to the judge model by llm_judge_score.  It defines the
+# 0-1 scoring rubric and asks for a JSON-only reply with "score" and
+# "reasoning" keys, which is the shape llm_judge_score parses.
+_JUDGE_SYSTEM_PROMPT = """\
+You are an evaluation judge for a data extraction system.
+
+Your task is to compare the EXPECTED extractions against the ACTUAL extractions
+produced by an AI agent, and assess quality on a 0-1 scale.
+
+Scoring criteria:
+- 1.0: All expected records found with correct fields, no significant extras
+- 0.8: Most expected records found, minor field differences or extras
+- 0.6: Core extractions present but some missing or incorrect
+- 0.4: Partial match — several expected records missing or wrong
+- 0.2: Poor quality — most expected records missing or incorrect
+- 0.0: Complete failure — no meaningful overlap
+
+Consider semantic equivalence: "Send invoice" and "Email the invoice" are matches.
+Ignore field ordering and formatting differences.
+
+Respond with ONLY a JSON object:
+{"score": 0.85, "reasoning": "Brief explanation of the score"}
+"""
+
+
+async def llm_judge_score(
+    expected: list[dict[str, Any]],
+    actual: list[dict[str, Any]],
+    *,
+    judge_model: str = "gpt-4o-mini",
+) -> tuple[float, str]:
+    """Use an LLM to semantically evaluate extraction quality.
+
+    Returns (score, reasoning).
+    """
+    from app.llm import get_llm
+
+    llm = get_llm(model=judge_model, temperature=0)
+
+    user_content = (
+        f"## Expected extractions\n```json\n{json.dumps(expected, indent=2, default=str)}\n```\n\n"
+        f"## Actual extractions\n```json\n{json.dumps(actual, indent=2, default=str)}\n```"
+    )
+
+    try:
+        response = await llm.ainvoke([
+            SystemMessage(content=_JUDGE_SYSTEM_PROMPT),
+            HumanMessage(content=user_content),
+        ])
+        raw = response.content.strip()
+        if raw.startswith("```"):
+            raw = raw.split("```")[1]
+            if raw.startswith("json"):
+                raw = raw[4:]
+        parsed = json.loads(raw.strip())
+        return float(parsed.get("score", 0.0)), str(parsed.get("reasoning", ""))
+    except Exception as exc:
+        logger.warning("eval: LLM judge failed: %s", exc)
+        return 0.0, f"Judge error: {exc}"