feat(batch-agent): add E2E evaluation harness with Langfuse integration

- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete) - eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides - eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval) - eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs - eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting - eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags - eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
2026-03-23 08:54:19 +01:00
parent 971f1dd84f
commit 75a826c9d8
12 changed files with 1382 additions and 0 deletions
--- a/services/batch-agent/eval/cli.py
+++ b/services/batch-agent/eval/cli.py
@@ -0,0 +1,182 @@
+"""CLI entry point for the batch agent evaluation harness.
+
+Usage::
+
+    # From services/batch-agent/:
+    python -m eval run                                # all fixtures, default model
+    python -m eval run --fixture=freelance-invoices   # single fixture
+    python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4
+    python -m eval run --variants=baseline,minimal    # specific prompt variants
+    python -m eval run --no-judge                     # skip LLM judge scoring
+
+    python -m eval list                               # list available fixtures
+    python -m eval sync                               # sync fixtures to Langfuse datasets
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import logging
+import sys
+from pathlib import Path
+
+# Ensure the service root and repo root are on sys.path before the eval imports below run.
+_SERVICE_ROOT = Path(__file__).resolve().parent.parent  # parent of the directory holding this file
+_REPO_ROOT = _SERVICE_ROOT.parent.parent  # two levels above the service root
+for p in (_SERVICE_ROOT, _REPO_ROOT):
+    if str(p) not in sys.path:  # leave sys.path untouched when the entry is already present
+        sys.path.insert(0, str(p))  # prepend; if both are added, _REPO_ROOT lands ahead of _SERVICE_ROOT
+
+from eval.config import discover_fixtures
+from eval.runner import run_fixture_eval, print_results
+from eval import langfuse_eval
+
+
+def _setup_logging(verbose: bool) -> None:
+    """Configure root logging: DEBUG when *verbose* is true, INFO otherwise."""
+    line_format = "%(asctime)s %(name)-20s %(levelname)-5s %(message)s"
+    root_level = logging.INFO
+    if verbose:
+        root_level = logging.DEBUG
+    logging.basicConfig(level=root_level, format=line_format, datefmt="%H:%M:%S")
+    # Cap these third-party loggers at WARNING regardless of the root level.
+    for noisy in ("httpx", "httpcore", "openai", "litellm", "urllib3"):
+        logging.getLogger(noisy).setLevel(logging.WARNING)
+
+
+def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse the command line for the ``run``, ``list`` and ``sync`` sub-commands.
+
+    Args:
+        argv: Arguments to parse; ``None`` (default) makes argparse read ``sys.argv[1:]``.
+
+    Returns:
+        Namespace holding ``command`` plus the options of the chosen sub-command.
+    """
+    # Options accepted by every sub-command are declared once on a parent parser.
+    common = argparse.ArgumentParser(add_help=False)
+    common.add_argument(
+        "--fixtures-dir",
+        default=None,
+        help="Path to fixtures directory (default: eval/fixtures/)",
+    )
+    common.add_argument("-v", "--verbose", action="store_true")
+
+    parser = argparse.ArgumentParser(
+        description="Batch Agent E2E evaluation harness",
+        prog="python -m eval",
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    # ── run ───────────────────────────────────────────────────────
+    run_cmd = sub.add_parser("run", parents=[common], help="Run evaluations")
+    run_cmd.add_argument(
+        "--fixture", "-f",
+        help="Run only the named fixture (default: all)",
+    )
+    run_cmd.add_argument(
+        "--models", "-m",
+        default="gpt-4o",
+        help="Comma-separated list of models to test (default: gpt-4o)",
+    )
+    run_cmd.add_argument(
+        "--variants", "-p",
+        default=None,
+        help="Comma-separated prompt variants to test (default: all in fixture)",
+    )
+    run_cmd.add_argument("--no-judge", action="store_true", help="Skip LLM-as-judge scoring")
+    run_cmd.add_argument(
+        "--judge-model",
+        default="gpt-4o-mini",
+        help="Model for LLM judge (default: gpt-4o-mini)",
+    )
+
+    # ── list / sync ───────────────────────────────────────────────
+    sub.add_parser("list", parents=[common], help="List available fixtures")
+    sync_cmd = sub.add_parser("sync", parents=[common], help="Sync fixtures to Langfuse datasets")
+    sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
+    return parser.parse_args(argv)
+
+
+def _fixtures_dir(arg: str | None) -> Path | None:
+    """Return *arg* as a ``Path``, or ``None`` when it is empty or unset."""
+    # A falsy value is passed through as None rather than as Path("").
+    return Path(arg) if arg else None
+
+
+async def _cmd_run(args: argparse.Namespace) -> None:
+    """Evaluate every selected fixture with the requested models/variants and print results.
+
+    Prints a message and returns early when no fixture is discovered or the
+    fixture named by ``--fixture`` does not exist.
+    """
+    fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir))
+    if not fixtures:
+        print("No fixtures found. Create YAML files in eval/fixtures/.")
+        return
+    if args.fixture:
+        fixtures = [f for f in fixtures if f.name == args.fixture]
+        if not fixtures:
+            print(f"Fixture '{args.fixture}' not found.")
+            return
+
+    # Drop blank entries so "a,,b" or a trailing comma never yields a model/variant named "".
+    models = [m.strip() for m in args.models.split(",") if m.strip()]
+    variants = [v.strip() for v in (args.variants or "").split(",") if v.strip()] or None
+    all_results = []
+    for fixture in fixtures:
+        results = await run_fixture_eval(
+            fixture, models=models, variants=variants,
+            use_llm_judge=not args.no_judge, judge_model=args.judge_model,
+        )
+        all_results.extend(results)
+    print_results(all_results)
+
+
+def _cmd_list(args: argparse.Namespace) -> None:
+    """Print a table of discovered fixtures: name, data types, variants, expected count."""
+    found = discover_fixtures(_fixtures_dir(args.fixtures_dir))
+    if not found:
+        print("No fixtures found.")
+        return
+    print(f"\n{'Name':<30} {'Types':<25} {'Variants':<20} {'Expected'}")
+    print("-" * 90)
+    for fx in found:
+        variant_col = ", ".join(fx.prompt_variants.keys())
+        type_col = ", ".join(fx.data_types)
+        print(f"{fx.name:<30} {type_col:<25} {variant_col:<20} {len(fx.expected)}")
+    print()
+
+
+def _cmd_sync(args: argparse.Namespace) -> None:
+    """Sync the selected fixtures to Langfuse datasets, printing one line per fixture."""
+    selected = discover_fixtures(_fixtures_dir(args.fixtures_dir))
+    if args.fixture:
+        selected = [fx for fx in selected if fx.name == args.fixture]
+    if not selected:
+        print("No fixtures to sync.")
+        return
+    for fx in selected:
+        dataset = langfuse_eval.sync_fixture_to_dataset(fx)
+        if not dataset:
+            # A falsy return value is reported as Langfuse not being configured.
+            print(f"Skipped: {fx.name} (Langfuse not configured)")
+            continue
+        print(f"Synced: {fx.name} → {dataset}")
+
+
+def main() -> None:
+    """Parse arguments, configure logging, then dispatch to the chosen sub-command."""
+    args = _parse_args()
+    _setup_logging(args.verbose)
+    if args.command == "sync":
+        _cmd_sync(args)
+    elif args.command == "list":
+        _cmd_list(args)
+    elif args.command == "run":
+        asyncio.run(_cmd_run(args))
+
+
+if __name__ == "__main__":  # only when this module is executed as the entry point
+    main()