# api/services/batch-agent/eval/scorer.py

"""Scoring functions for batch agent evaluation.

Two scoring strategies:

1. **FieldMatchScorer** (`score_field_match`) — deterministic check: for each
   expected record, find the best-matching actual record and compare the
   expected fields.  Yields per-record field scores plus extra/missing
   counts; precision, recall, and F1 are derived from the counts with
   `compute_precision_recall`.

2. **LLMJudgeScorer** (`llm_judge_score`) — uses a secondary LLM to
   semantically evaluate whether the actual extractions satisfy the expected
   intent, even if wording differs.  Returns a 0-1 score + reasoning.
"""

from __future__ import annotations

import json
import logging
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Any

from langchain_core.messages import HumanMessage, SystemMessage

# Module-level logger; the LLM judge reports its failures through it.
logger = logging.getLogger(__name__)


# ── Result types ─────────────────────────────────────────────────────────


@dataclass
class FieldScore:
    """Outcome of comparing one expected record with its closest actual record."""

    expected: dict[str, Any]  # the record the fixture expects
    best_match: dict[str, Any] | None  # paired actual record, None when unmatched
    matched_fields: dict[str, bool]  # per-field verdict, keyed by field name
    similarity: float  # 0-1 overall similarity

    @property
    def field_accuracy(self) -> float:
        """Fraction of compared fields that matched; 0.0 when none were compared."""
        verdicts = self.matched_fields
        return sum(verdicts.values()) / len(verdicts) if verdicts else 0.0


@dataclass
class EvalScores:
    """Roll-up of every score produced by a single eval run."""

    fixture_name: str
    model: str
    prompt_variant: str
    field_scores: list[FieldScore] = field(default_factory=list)
    precision: float = 0.0
    recall: float = 0.0
    f1: float = 0.0
    llm_judge_score: float | None = None
    llm_judge_reasoning: str = ""
    extra_records: int = 0  # records created but not expected
    missing_records: int = 0  # expected but not found

    @property
    def field_accuracy(self) -> float:
        """Mean per-record field accuracy; 0.0 when there are no field scores."""
        scores = self.field_scores
        if scores:
            return sum(entry.field_accuracy for entry in scores) / len(scores)
        return 0.0

    def summary(self) -> dict[str, Any]:
        """Return a flat report dict with float metrics rounded to 3 decimals."""
        report: dict[str, Any] = {
            "fixture": self.fixture_name,
            "model": self.model,
            "prompt_variant": self.prompt_variant,
        }
        # Always-present float metrics share the same rounding.
        for metric in ("precision", "recall", "f1", "field_accuracy"):
            report[metric] = round(getattr(self, metric), 3)

        # The judge score is optional and stays None when no judge ran.
        judge = self.llm_judge_score
        report["llm_judge_score"] = None if judge is None else round(judge, 3)

        report["extra_records"] = self.extra_records
        report["missing_records"] = self.missing_records
        return report


# ── Field Match Scorer ───────────────────────────────────────────────────


def _normalize(value: Any) -> str:
    """Return *value* as a trimmed, lower-cased string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip().lower()


def _text_similarity(a: str, b: str) -> float:
    """Case-insensitive fuzzy similarity of two strings via SequenceMatcher.

    Two empty strings count as identical (1.0); one empty string against a
    non-empty one counts as no match (0.0).
    """
    if a and b:
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()
    # At least one side is empty: a match only when both are.
    return 0.0 if (a or b) else 1.0


def _find_best_match(
    expected: dict[str, Any],
    actuals: list[dict[str, Any]],
) -> tuple[dict[str, Any] | None, float]:
    """Pick the actual record whose title/name is closest to *expected*'s.

    Returns ``(record, similarity)``.  The record is ``None`` (with similarity
    0.0) when *actuals* is empty or no candidate scores above zero.  Ties keep
    the earliest candidate.
    """
    if not actuals:
        return None, 0.0

    def _label(record: dict[str, Any]) -> str:
        # Matching key: "title" when the key is present, otherwise "name".
        return _normalize(record.get("title", record.get("name", "")))

    target = _label(expected)
    winner: dict[str, Any] | None = None
    top_score = 0.0

    for candidate in actuals:
        score = _text_similarity(target, _label(candidate))
        if score > top_score:
            winner, top_score = candidate, score

    return winner, top_score


def _compare_fields(
    expected: dict[str, Any],
    actual: dict[str, Any],
) -> dict[str, bool]:
    """Check every field of *expected* against the same key in *actual*.

    Returns a mapping of field name to whether it matched.  Keys missing from
    *actual* are compared as ``None``.
    """

    def _matches(want: Any, got: Any) -> bool:
        if isinstance(want, str):
            # Strings are compared fuzzily after normalization (threshold: 0.7).
            return _text_similarity(_normalize(want), _normalize(got)) >= 0.7
        # Everything else must be exactly equal.
        return got == want

    return {key: _matches(want, actual.get(key)) for key, want in expected.items()}


def score_field_match(
    expected_records: list[dict[str, Any]],
    actual_records: list[dict[str, Any]],
    table: str,
) -> tuple[list[FieldScore], int, int]:
    """Greedily pair expected records with actual records and score each pair.

    Expected records are processed in order; each one claims the still-unclaimed
    actual record with the highest title/name similarity, provided that
    similarity is at least 0.5.  An actual record can be claimed only once.

    ``table`` is accepted for the caller's benefit but is not used here.

    Returns ``(field_scores, extra_count, missing_count)`` where ``extra_count``
    is the number of unclaimed actual records and ``missing_count`` the number
    of expected records left without a match.
    """
    scores: list[FieldScore] = []
    claimed: set[int] = set()

    for wanted in expected_records:
        top_idx: int | None = None
        top_sim = 0.0

        # Scan only the actual records nobody has claimed yet.
        for pos, candidate in enumerate(actual_records):
            if pos in claimed:
                continue
            sim = _find_best_match(wanted, [candidate])[1]
            if sim > top_sim:
                top_idx, top_sim = pos, sim

        # No candidate at all, or the best one is too dissimilar: unmatched.
        if top_idx is None or top_sim < 0.5:
            scores.append(FieldScore(
                expected=wanted, best_match=None, matched_fields={}, similarity=0.0,
            ))
            continue

        claimed.add(top_idx)
        chosen = actual_records[top_idx]
        scores.append(FieldScore(
            expected=wanted,
            best_match=chosen,
            matched_fields=_compare_fields(wanted, chosen),
            similarity=top_sim,
        ))

    unclaimed = len(actual_records) - len(claimed)
    unmatched = len([entry for entry in scores if entry.best_match is None])

    return scores, unclaimed, unmatched


def compute_precision_recall(
    expected_count: int,
    actual_count: int,
    matched_count: int,
) -> tuple[float, float, float]:
    """Return ``(precision, recall, f1)`` for the given record counts.

    Each ratio falls back to 0.0 when its denominator is not positive.
    """
    precision = 0.0
    if actual_count > 0:
        precision = matched_count / actual_count

    recall = 0.0
    if expected_count > 0:
        recall = matched_count / expected_count

    f1 = 0.0
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1


# ── LLM Judge Scorer ─────────────────────────────────────────────────────

# System prompt sent to the judge model by llm_judge_score.  It defines the
# 0-1 scoring rubric and asks for a JSON-only reply with "score" and
# "reasoning" keys, which is what the reply parser reads back.
_JUDGE_SYSTEM_PROMPT = """\
You are an evaluation judge for a data extraction system.

Your task is to compare the EXPECTED extractions against the ACTUAL extractions
produced by an AI agent, and assess quality on a 0-1 scale.

Scoring criteria:
- 1.0: All expected records found with correct fields, no significant extras
- 0.8: Most expected records found, minor field differences or extras
- 0.6: Core extractions present but some missing or incorrect
- 0.4: Partial match — several expected records missing or wrong
- 0.2: Poor quality — most expected records missing or incorrect
- 0.0: Complete failure — no meaningful overlap

Consider semantic equivalence: "Send invoice" and "Email the invoice" are matches.
Ignore field ordering and formatting differences.

Respond with ONLY a JSON object:
{"score": 0.85, "reasoning": "Brief explanation of the score"}
"""


def _coerce_message_text(content: Any) -> str:
    """Flatten a chat-message ``content`` payload into a single string.

    Plain strings pass through unchanged.  For a list payload, string items and
    the ``"text"`` value of dict items are concatenated in order; other items
    are ignored.  Any other payload type is stringified.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces: list[str] = []
        for item in content:
            if isinstance(item, str):
                pieces.append(item)
            elif isinstance(item, dict) and isinstance(item.get("text"), str):
                pieces.append(item["text"])
        return "".join(pieces)
    return str(content)


def _parse_judge_reply(raw: str) -> tuple[float, str]:
    """Extract ``(score, reasoning)`` from the judge's raw reply text.

    The JSON object is located by its outermost braces, so code fences or
    prose surrounding the object are tolerated.  The score is clamped to the
    0-1 range; a missing ``"score"`` key yields 0.0.

    Raises:
        ValueError: no JSON object is present, the JSON is malformed, or the
            score is NaN or not parseable as a number.
        TypeError: the score has a type ``float()`` cannot convert.
    """
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in judge reply")

    # A span that starts with "{" and ends with "}" parses to a dict or fails.
    parsed = json.loads(raw[start:end + 1])

    score = float(parsed.get("score", 0.0))
    if score != score:  # NaN is the only float that differs from itself
        raise ValueError("judge returned a NaN score")

    return min(1.0, max(0.0, score)), str(parsed.get("reasoning", ""))


async def llm_judge_score(
    expected: list[dict[str, Any]],
    actual: list[dict[str, Any]],
    *,
    judge_model: str = "gpt-4o-mini",
) -> tuple[float, str]:
    """Use an LLM to semantically evaluate extraction quality.

    Args:
        expected: Records the fixture expects the agent to extract.
        actual: Records the agent actually produced.
        judge_model: Model name passed to ``get_llm`` for the judge.

    Returns:
        ``(score, reasoning)`` with the score clamped to 0-1.  If the judge
        call or the parsing of its reply fails, the failure is logged and
        ``(0.0, "Judge error: ...")`` is returned instead of raising.
    """
    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)

    # default=str keeps non-JSON-native values (dates, UUIDs, ...) printable.
    user_content = (
        f"## Expected extractions\n```json\n{json.dumps(expected, indent=2, default=str)}\n```\n\n"
        f"## Actual extractions\n```json\n{json.dumps(actual, indent=2, default=str)}\n```"
    )

    try:
        response = await llm.ainvoke([
            SystemMessage(content=_JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_content),
        ])
        return _parse_judge_reply(_coerce_message_text(response.content))
    except Exception as exc:
        # Deliberately broad: judging is best-effort and must not abort the
        # eval run, whatever the provider or the parser raises.
        logger.warning("eval: LLM judge failed: %s", exc)
        return 0.0, f"Judge error: {exc}"