feat(batch-agent): add E2E evaluation harness with Langfuse integration
- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete)
- eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides
- eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval)
- eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs
- eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting
- eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags
- eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
This commit is contained in:
182
services/batch-agent/eval/cli.py
Normal file
182
services/batch-agent/eval/cli.py
Normal file
@@ -0,0 +1,182 @@
|
||||
"""CLI entry point for the batch agent evaluation harness.
|
||||
|
||||
Usage::
|
||||
|
||||
# From services/batch-agent/:
|
||||
python -m eval run # all fixtures, default model
|
||||
python -m eval run --fixture=freelance-invoices # single fixture
|
||||
python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4
|
||||
python -m eval run --variants=baseline,detailed # specific prompt variants
|
||||
python -m eval run --no-judge # skip LLM judge scoring
|
||||
|
||||
python -m eval list # list available fixtures
|
||||
python -m eval sync # sync fixtures to Langfuse datasets
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Make the service root (two levels above this file) and the repo root (two
# levels above the service root) importable by putting them on sys.path.
_SERVICE_ROOT = Path(__file__).resolve().parent.parent
_REPO_ROOT = _SERVICE_ROOT.parent.parent
for p in (_SERVICE_ROOT, _REPO_ROOT):
    if str(p) in sys.path:
        continue  # already present; avoid inserting a duplicate entry
    sys.path.insert(0, str(p))
|
||||
|
||||
from eval.config import discover_fixtures
|
||||
from eval.runner import run_fixture_eval, print_results
|
||||
from eval import langfuse_eval
|
||||
|
||||
|
||||
def _setup_logging(verbose: bool) -> None:
    """Configure logging for the CLI.

    Args:
        verbose: When true the requested root level is DEBUG, otherwise INFO.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s %(name)-20s %(levelname)-5s %(message)s",
        datefmt="%H:%M:%S",
    )

    # Chatty third-party loggers are capped at WARNING regardless of verbosity.
    noisy_loggers = ("httpx", "httpcore", "openai", "litellm", "urllib3")
    for logger_name in noisy_loggers:
        logging.getLogger(logger_name).setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command line of the evaluation harness.

    Args:
        argv: Argument strings to parse, without the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]``, so existing
            zero-argument callers behave as before; passing a list allows
            the parser to be driven programmatically.

    Returns:
        A namespace whose ``command`` is ``"run"``, ``"list"`` or ``"sync"``
        together with the options of that sub-command.
    """
    parser = argparse.ArgumentParser(
        description="Batch Agent E2E evaluation harness",
        prog="python -m eval",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # Options accepted by every sub-command, declared once and inherited
    # through ``parents=`` instead of being repeated three times.
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument(
        "--fixtures-dir",
        default=None,
        help="Path to fixtures directory (default: eval/fixtures/)",
    )
    common.add_argument("-v", "--verbose", action="store_true")

    # ── run ───────────────────────────────────────────────────────
    run_cmd = sub.add_parser("run", parents=[common], help="Run evaluations")
    run_cmd.add_argument(
        "--fixture", "-f",
        help="Run only the named fixture (default: all)",
    )
    run_cmd.add_argument(
        "--models", "-m",
        default="gpt-4o",
        help="Comma-separated list of models to test (default: gpt-4o)",
    )
    run_cmd.add_argument(
        "--variants", "-p",
        default=None,
        help="Comma-separated prompt variants to test (default: all in fixture)",
    )
    run_cmd.add_argument(
        "--no-judge",
        action="store_true",
        help="Skip LLM-as-judge scoring",
    )
    run_cmd.add_argument(
        "--judge-model",
        default="gpt-4o-mini",
        help="Model for LLM judge (default: gpt-4o-mini)",
    )

    # ── list ──────────────────────────────────────────────────────
    sub.add_parser("list", parents=[common], help="List available fixtures")

    # ── sync ──────────────────────────────────────────────────────
    sync_cmd = sub.add_parser(
        "sync", parents=[common], help="Sync fixtures to Langfuse datasets"
    )
    sync_cmd.add_argument(
        "--fixture", "-f", default=None, help="Sync only the named fixture"
    )

    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def _fixtures_dir(arg: str | None) -> Path | None:
    """Turn the ``--fixtures-dir`` option value into a ``Path``.

    Returns ``None`` when *arg* is ``None`` or an empty string.
    """
    return Path(arg) if arg else None
|
||||
|
||||
|
||||
def _split_csv(value: str | None) -> list[str]:
    """Split a comma-separated option value into trimmed, non-empty items."""
    if not value:
        return []
    return [item.strip() for item in value.split(",") if item.strip()]


async def _cmd_run(args: argparse.Namespace) -> None:
    """Run the evaluation for the selected fixtures and print the results.

    Args:
        args: Parsed ``run`` options. Reads ``fixtures_dir``, ``fixture``,
            ``models``, ``variants``, ``no_judge`` and ``judge_model``.

    Prints a message and returns early when no fixtures are discovered,
    when the named fixture does not exist, or when ``--models`` contains no
    usable model name.
    """
    fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir))
    if not fixtures:
        print("No fixtures found. Create YAML files in eval/fixtures/.")
        return

    if args.fixture:
        fixtures = [f for f in fixtures if f.name == args.fixture]
        if not fixtures:
            print(f"Fixture '{args.fixture}' not found.")
            return

    # Blank entries (e.g. a trailing comma in "gpt-4o,") are dropped so an
    # empty string is never handed to the runner as a model or variant name.
    models = _split_csv(args.models)
    if not models:
        print("No models given. Pass at least one model name via --models.")
        return
    # None is passed when no variant names were requested; the --variants
    # help text describes that case as "all in fixture".
    variants = _split_csv(args.variants) or None

    all_results = []
    for fixture in fixtures:
        results = await run_fixture_eval(
            fixture,
            models=models,
            variants=variants,
            use_llm_judge=not args.no_judge,
            judge_model=args.judge_model,
        )
        all_results.extend(results)

    print_results(all_results)
|
||||
|
||||
|
||||
def _cmd_list(args: argparse.Namespace) -> None:
    """Print a table of the discovered fixtures.

    Each row shows the fixture name, its data types, its prompt variant
    names and the number of expected entries. Prints a short notice instead
    when no fixtures are discovered.
    """
    found = discover_fixtures(_fixtures_dir(args.fixtures_dir))
    if not found:
        print("No fixtures found.")
        return

    header = f"\n{'Name':<30} {'Types':<25} {'Variants':<20} {'Expected'}"
    print(header)
    print("-" * 90)
    for fx in found:
        variant_col = ", ".join(fx.prompt_variants.keys())
        type_col = ", ".join(fx.data_types)
        row = f"{fx.name:<30} {type_col:<25} {variant_col:<20} {len(fx.expected)}"
        print(row)
    print()
|
||||
|
||||
|
||||
def _cmd_sync(args: argparse.Namespace) -> None:
    """Sync the selected fixtures to Langfuse datasets and report each one.

    When ``args.fixture`` is set, only the fixture with that name is synced.
    A falsy result from ``langfuse_eval.sync_fixture_to_dataset`` is
    reported as skipped.
    """
    selected = discover_fixtures(_fixtures_dir(args.fixtures_dir))
    if args.fixture:
        selected = [fx for fx in selected if fx.name == args.fixture]

    if not selected:
        print("No fixtures to sync.")
        return

    for fx in selected:
        dataset_name = langfuse_eval.sync_fixture_to_dataset(fx)
        if not dataset_name:
            print(f"Skipped: {fx.name} (Langfuse not configured)")
            continue
        print(f"Synced: {fx.name} → {dataset_name}")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line, set up logging and run the chosen sub-command."""
    args = _parse_args()
    _setup_logging(args.verbose)

    command = args.command
    if command == "run":
        # ``run`` is the only coroutine-based command, so it gets its own loop.
        asyncio.run(_cmd_run(args))
        return

    handlers = {"list": _cmd_list, "sync": _cmd_sync}
    handler = handlers.get(command)
    if handler is not None:
        handler(args)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user