feat(batch-agent): add E2E evaluation harness with Langfuse integration

- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete)
- eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides
- eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval)
- eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs
- eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting
- eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags
- eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
2026-03-23 08:54:19 +01:00
parent 971f1dd84f
commit 75a826c9d8
12 changed files with 1382 additions and 0 deletions
--- a/services/batch-agent/eval/runner.py
+++ b/services/batch-agent/eval/runner.py
@@ -0,0 +1,236 @@
+"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.
+
+For each (fixture × model × prompt_variant) combination:
+1. Build a MockExecutor with fixture data
+2. Patch execute_on_client
+3. Override LLM_MODEL in shared settings
+4. Run the batch agent pipeline (run_local_agent)
+5. Collect mutations from the mock
+6. Score against expected results (field match + optional LLM judge)
+7. Report scores to Langfuse
+8. Print results
+"""
+
+from __future__ import annotations
+
+import asyncio
+import copy
+import json
+import logging
+import time
+import uuid
+from pathlib import Path
+from typing import Any
+
+from eval.config import EvalFixture, ExpectedRecord
+from eval.mock_executor import MockExecutor
+from eval.scorer import (
+    EvalScores,
+    FieldScore,
+    compute_precision_recall,
+    llm_judge_score,
+    score_field_match,
+)
+from eval import langfuse_eval
+
+logger = logging.getLogger(__name__)
+
+
+async def run_single_eval(
+    fixture: EvalFixture,
+    model: str,
+    prompt_variant: str,
+    *,
+    use_llm_judge: bool = True,
+    judge_model: str = "gpt-4o-mini",
+) -> EvalScores:
+    """Execute one (fixture × model × prompt_variant) eval and return scores.
+
+    The agent pipeline runs against a ``MockExecutor`` built from the fixture
+    while ``settings.LLM_MODEL`` is temporarily set to *model*. An exception
+    raised by the pipeline is logged with its traceback and scoring still
+    proceeds on whatever mutations the mock recorded, so a single failing
+    combination does not abort the caller's sweep.
+
+    Args:
+        fixture: Supplies seed records, prompt variants and expected records.
+        model: Value assigned to ``settings.LLM_MODEL`` for the run.
+        prompt_variant: Key into ``fixture.prompt_variants``. An unknown key
+            is logged and falls back to an empty prompt template.
+        use_llm_judge: When true and the fixture has expected records, also
+            request a score from ``llm_judge_score``.
+        judge_model: Model name forwarded to ``llm_judge_score``.
+
+    Returns:
+        The ``EvalScores`` computed for this combination.
+
+    Note:
+        ``settings`` is a shared module-level object, so overlapping calls
+        would observe each other's model override.
+    """
+    from shared.config import settings
+
+    if prompt_variant not in fixture.prompt_variants:
+        logger.warning(
+            "eval: variant %r not found in fixture %s; using an empty prompt template",
+            prompt_variant, fixture.name,
+        )
+    prompt_template = fixture.prompt_variants.get(prompt_variant, "")
+
+    # Deep-copy so the run cannot mutate the fixture's own seed records.
+    mock = MockExecutor(
+        fixture_dir=fixture.fixture_dir,
+        seed_records=copy.deepcopy(fixture.seed_records),
+    )
+    trigger_data = _build_trigger_data(fixture, prompt_variant, prompt_template)
+    eval_user_id = f"eval-{uuid.uuid4().hex[:8]}"
+
+    logger.info(
+        "eval: starting %s | model=%s | variant=%s",
+        fixture.name, model, prompt_variant,
+    )
+
+    # Stays None until the import below succeeds, so the cleanup in `finally`
+    # never has to import (and possibly raise) on its own.
+    clear_current_user = None
+    # Monotonic clock: the elapsed figure must not jump with wall-clock changes.
+    started = time.perf_counter()
+
+    # Override the model immediately before the try block so that every path
+    # after this line restores it.
+    original_model = settings.LLM_MODEL
+    settings.LLM_MODEL = model
+    try:
+        # Imported lazily, after the model override, as in the original flow.
+        from app.ws_context import clear_current_user, set_current_user
+        from app.agent_runner import run_local_agent
+
+        set_current_user(eval_user_id)
+        with mock.patch():
+            await run_local_agent(eval_user_id, trigger_data)
+    except Exception as exc:
+        # Best effort: keep going and score what was recorded, but keep the
+        # traceback so the failure can be diagnosed.
+        logger.exception(
+            "eval: pipeline failed for %s/%s/%s: %s",
+            fixture.name, model, prompt_variant, exc,
+        )
+    finally:
+        settings.LLM_MODEL = original_model
+        if clear_current_user is not None:
+            clear_current_user()
+
+    elapsed = time.perf_counter() - started
+    logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
+
+    # ── Score results ────────────────────────────────────────────
+    scores = _score_field_matches(fixture, mock, model, prompt_variant)
+
+    # ── Optional LLM judge ───────────────────────────────────────
+    if use_llm_judge and fixture.expected:
+        all_expected = [r.fields for r in fixture.expected]
+        all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
+        judge_score, reasoning = await llm_judge_score(
+            all_expected, all_actual, judge_model=judge_model,
+        )
+        scores.llm_judge_score = judge_score
+        scores.llm_judge_reasoning = reasoning
+
+    # ── Report to Langfuse ───────────────────────────────────────
+    dataset_name = f"batch-eval-{fixture.name}"
+    dataset_item_id = f"{fixture.name}--{prompt_variant}"
+    run_name = f"{model}--{prompt_variant}--{int(time.time())}"
+
+    trace_id = langfuse_eval.log_eval_trace(
+        fixture_name=fixture.name,
+        model=model,
+        prompt_variant=prompt_variant,
+        prompt_template=prompt_template,
+        actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
+        scores_summary=scores.summary(),
+        dataset_name=dataset_name,
+        run_name=run_name,
+        dataset_item_id=dataset_item_id,
+    )
+
+    # Scores are only posted when a trace id came back.
+    if trace_id:
+        langfuse_eval.post_eval_scores(scores, trace_id=trace_id)
+
+    return scores
+
+
+def _build_trigger_data(
+    fixture: EvalFixture,
+    prompt_variant: str,
+    prompt_template: str,
+) -> dict[str, Any]:
+    """Build the trigger payload handed to ``run_local_agent`` for one eval."""
+    # Same shape as what redis_consumer delivers.
+    return {
+        "type": "agent_trigger",
+        "directory": fixture.directory,
+        "directory_paths": [fixture.directory],
+        "data_types": fixture.data_types,
+        "file_extensions": fixture.file_extensions,
+        "prompt_template": prompt_template,
+        "device_id": "eval-harness",
+        "run_context": {
+            "agent_id": f"eval-{fixture.name}-{prompt_variant}",
+            "run_id": None,  # skip DB logging during eval
+        },
+    }
+
+
+def _score_field_matches(
+    fixture: EvalFixture,
+    mock: MockExecutor,
+    model: str,
+    prompt_variant: str,
+) -> EvalScores:
+    """Field-match the mock's created/updated records against ``fixture.expected``."""
+    # Group expected by table
+    expected_by_table: dict[str, list[dict]] = {}
+    for rec in fixture.expected:
+        expected_by_table.setdefault(rec.table, []).append(rec.fields)
+
+    # Expected tables first, then tables only the agent touched. A dict keeps
+    # first-seen order, so the order of the collected field scores is the same
+    # on every run (iterating a set of strings is not).
+    tables = list(
+        dict.fromkeys([*expected_by_table, *(m.table for m in mock.mutations)])
+    )
+
+    all_field_scores: list[FieldScore] = []
+    total_expected = 0
+    total_actual = 0
+    total_matched = 0
+    total_extra = 0
+    total_missing = 0
+
+    # Compare against actual mutations (inserts + updates)
+    for table in tables:
+        expected_records = expected_by_table.get(table, [])
+        actual_records = mock.created_records(table) + mock.updated_records(table)
+
+        field_scores, extra, missing = score_field_match(expected_records, actual_records, table)
+        all_field_scores.extend(field_scores)
+
+        # NOTE(review): this counts FieldScore entries that have a best_match;
+        # whether that equals the number of matched records depends on what
+        # score_field_match returns — confirm against eval/scorer.py.
+        matched = sum(1 for s in field_scores if s.best_match is not None)
+        total_expected += len(expected_records)
+        total_actual += len(actual_records)
+        total_matched += matched
+        total_extra += extra
+        total_missing += missing
+
+    precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
+
+    return EvalScores(
+        fixture_name=fixture.name,
+        model=model,
+        prompt_variant=prompt_variant,
+        field_scores=all_field_scores,
+        precision=precision,
+        recall=recall,
+        f1=f1,
+        extra_records=total_extra,
+        missing_records=total_missing,
+    )
+
+
+async def run_fixture_eval(
+    fixture: EvalFixture,
+    models: list[str],
+    *,
+    variants: list[str] | None = None,
+    use_llm_judge: bool = True,
+    judge_model: str = "gpt-4o-mini",
+) -> list[EvalScores]:
+    """Evaluate *fixture* for every requested model and prompt variant.
+
+    When *variants* is ``None``, every variant the fixture defines is used.
+    ``langfuse_eval.sync_fixture_to_dataset`` is called once up front; the
+    combinations then run one after another, models outermost. A requested
+    variant the fixture does not define is logged as a warning and skipped.
+
+    Returns:
+        One ``EvalScores`` per combination actually executed, in run order.
+    """
+    if variants is None:
+        selected = list(fixture.prompt_variants.keys())
+    else:
+        selected = variants
+
+    # Push the fixture to its Langfuse dataset before any run is recorded.
+    langfuse_eval.sync_fixture_to_dataset(fixture)
+
+    collected: list[EvalScores] = []
+    for model in models:
+        for variant in selected:
+            if variant in fixture.prompt_variants:
+                outcome = await run_single_eval(
+                    fixture,
+                    model,
+                    variant,
+                    use_llm_judge=use_llm_judge,
+                    judge_model=judge_model,
+                )
+                collected.append(outcome)
+            else:
+                logger.warning("eval: variant %r not found in fixture %s", variant, fixture.name)
+
+    return collected
+
+
+def print_results(results: list[EvalScores]) -> None:
+    """Write a fixed-width summary table of eval results to stdout.
+
+    One row per result with precision (P), recall (R), F1, field accuracy
+    (FA) and the LLM-judge score (shown as ``--`` when absent). Any judge
+    reasoning is listed after the table. An empty *results* list prints a
+    short notice instead of a table.
+    """
+    if not results:
+        print("\nNo eval results.")
+        return
+
+    table_width = 90
+    heavy_rule = "=" * table_width
+    light_rule = "-" * table_width
+
+    print("\n" + heavy_rule)
+    print(f"{'Fixture':<25} {'Model':<25} {'Variant':<15} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
+    print(light_rule)
+
+    for entry in results:
+        # The judge column falls back to a placeholder when no judge ran.
+        if entry.llm_judge_score is None:
+            judge_cell = "  --"
+        else:
+            judge_cell = f"{entry.llm_judge_score:.2f}"
+        row = (
+            f"{entry.fixture_name:<25} {entry.model:<25} {entry.prompt_variant:<15} "
+            f"{entry.precision:>6.2f} {entry.recall:>6.2f} {entry.f1:>6.2f} "
+            f"{entry.field_accuracy:>6.2f} {judge_cell:>6}"
+        )
+        print(row)
+
+    print(heavy_rule)
+
+    # Judge reasoning, when present, follows the table one entry per result.
+    for entry in results:
+        if not entry.llm_judge_reasoning:
+            continue
+        print(f"\n[{entry.model} / {entry.prompt_variant}] LLM Judge: {entry.llm_judge_reasoning}")
+
+    print()