"""CLI entry point for the batch agent evaluation harness.

Usage::

    # From services/batch-agent/:
    python -m eval run                                  # all agent fixtures, default model
    python -m eval run --fixture=freelance-invoices     # single fixture
    python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4
    python -m eval run --variants=baseline,detailed     # specific prompt variants
    python -m eval run --no-judge                       # skip LLM judge scoring
    python -m eval journey                              # all journey fixtures
    python -m eval journey --fixture=journey-invoices   # single journey fixture
    python -m eval journey --models=gpt-4o,anthropic/claude-sonnet-4
    python -m eval list                                 # list all fixtures
    python -m eval sync                                 # sync fixtures to Langfuse datasets
"""

from __future__ import annotations

import argparse
import asyncio
import logging
import sys
from pathlib import Path

# Ensure the service root and repo root are in sys.path
_SERVICE_ROOT = Path(__file__).resolve().parent.parent
_REPO_ROOT = _SERVICE_ROOT.parent.parent
for p in (_SERVICE_ROOT, _REPO_ROOT):
    if str(p) not in sys.path:
        sys.path.insert(0, str(p))

from eval.config import discover_fixtures, discover_journey_fixtures
from eval.runner import run_fixture_eval, print_results
from eval.journey_runner import run_journey_fixture_eval, print_journey_results
from eval import langfuse_eval


def _setup_logging(verbose: bool) -> None:
    """Configure root logging: DEBUG when *verbose* is true, INFO otherwise."""
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(name)-20s %(levelname)-5s %(message)s",
        datefmt="%H:%M:%S",
    )
    # Quiet noisy libraries
    for name in ("httpx", "httpcore", "openai", "litellm", "urllib3"):
        logging.getLogger(name).setLevel(logging.WARNING)


def _parse_args() -> argparse.Namespace:
    """Build the argument parser and parse ``sys.argv``.

    A sub-command (``run``, ``list``, ``journey`` or ``sync``) is required
    and is stored on the returned namespace as ``command``.
    """
    parser = argparse.ArgumentParser(
        description="Batch Agent E2E evaluation harness",
        prog="python -m eval",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # ── run ───────────────────────────────────────────────────────
    run_cmd = sub.add_parser("run", help="Run evaluations")
    run_cmd.add_argument(
        "--fixture", "-f",
        help="Run only the named fixture (default: all)",
    )
    run_cmd.add_argument(
        "--models", "-m",
        default="gpt-4o",
        help="Comma-separated list of models to test (default: gpt-4o)",
    )
    run_cmd.add_argument(
        "--variants", "-p",
        default=None,
        help="Comma-separated prompt variants to test (default: all in fixture)",
    )
    run_cmd.add_argument(
        "--no-judge",
        action="store_true",
        help="Skip LLM-as-judge scoring",
    )
    run_cmd.add_argument(
        "--judge-model",
        default="gpt-4o-mini",
        help="Model for LLM judge (default: gpt-4o-mini)",
    )
    run_cmd.add_argument(
        "--fixtures-dir",
        default=None,
        help="Path to fixtures directory (default: eval/fixtures/)",
    )
    run_cmd.add_argument("-v", "--verbose", action="store_true")

    # ── list ──────────────────────────────────────────────────────
    list_cmd = sub.add_parser("list", help="List available fixtures")
    list_cmd.add_argument("--fixtures-dir", default=None)
    list_cmd.add_argument("-v", "--verbose", action="store_true")

    # ── journey ───────────────────────────────────────────────────
    journey_cmd = sub.add_parser("journey", help="Run journey evaluations")
    journey_cmd.add_argument(
        "--fixture", "-f",
        help="Run only the named journey fixture (default: all)",
    )
    journey_cmd.add_argument(
        "--models", "-m",
        default="gpt-4o",
        help="Comma-separated list of models to test (default: gpt-4o)",
    )
    journey_cmd.add_argument(
        "--judge-model",
        default="gpt-4o-mini",
        help="Model for LLM judge (default: gpt-4o-mini)",
    )
    journey_cmd.add_argument(
        "--fixtures-dir",
        default=None,
        help="Path to fixtures directory (default: eval/fixtures/)",
    )
    journey_cmd.add_argument("-v", "--verbose", action="store_true")

    # ── sync ──────────────────────────────────────────────────────
    sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets")
    sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
    sync_cmd.add_argument("--fixtures-dir", default=None)
    sync_cmd.add_argument("-v", "--verbose", action="store_true")

    return parser.parse_args()


def _fixtures_dir(arg: str | None) -> Path | None:
    """Convert the ``--fixtures-dir`` value to a ``Path``; ``None`` if unset or empty."""
    if arg:
        return Path(arg)
    return None


def _split_csv(raw: str | None) -> list[str]:
    """Split a comma-separated option value into stripped, non-empty items.

    Blank entries are dropped so that a trailing or doubled comma
    (``"gpt-4o,"``, ``"a,,b"``) never produces an empty model or variant
    name. ``None`` and the empty string both yield an empty list.
    """
    if not raw:
        return []
    return [item.strip() for item in raw.split(",") if item.strip()]


async def _cmd_run(args: argparse.Namespace) -> None:
    """Run the agent fixture evaluations selected by *args* and print the results.

    Prints a message and returns early when no fixtures are discovered, the
    requested fixture name does not match any of them, or ``--models``
    contains no usable model name.
    """
    fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir))
    if not fixtures:
        print("No fixtures found. Create YAML files in eval/fixtures/.")
        return

    if args.fixture:
        fixtures = [f for f in fixtures if f.name == args.fixture]
        if not fixtures:
            print(f"Fixture '{args.fixture}' not found.")
            return

    models = _split_csv(args.models)
    if not models:
        # Without this guard the loop below would run zero evaluations and
        # print an empty result set with no explanation.
        print("No models specified. Pass a comma-separated list via --models.")
        return

    # An unset or effectively empty --variants is passed through as None,
    # which the option's help text describes as "all in fixture".
    variants = _split_csv(args.variants) or None

    all_results = []
    for fixture in fixtures:
        results = await run_fixture_eval(
            fixture,
            models=models,
            variants=variants,
            use_llm_judge=not args.no_judge,
            judge_model=args.judge_model,
        )
        all_results.extend(results)

    print_results(all_results)


def _cmd_list(args: argparse.Namespace) -> None:
    """Print a table of the discovered agent fixtures and journey fixtures."""
    fixtures_dir = _fixtures_dir(args.fixtures_dir)
    fixtures = discover_fixtures(fixtures_dir)
    journey_fixtures = discover_journey_fixtures(fixtures_dir)

    if not fixtures and not journey_fixtures:
        print("No fixtures found.")
        return

    if fixtures:
        print("\n[Agent Fixtures]")
        print(f"{'Name':<30} {'Types':<25} {'Variants':<20} {'Expected'}")
        print("-" * 90)
        for f in fixtures:
            variants = ", ".join(f.prompt_variants.keys())
            types = ", ".join(f.data_types)
            print(f"{f.name:<30} {types:<25} {variants:<20} {len(f.expected)}")

    if journey_fixtures:
        print("\n[Journey Fixtures]")
        print(f"{'Name':<30} {'Types':<25} {'Messages':<10} {'Criteria'}")
        print("-" * 90)
        for f in journey_fixtures:
            types = ", ".join(f.data_types)
            print(
                f"{f.name:<30} {types:<25} {len(f.user_messages):<10} "
                f"{len(f.expected_template_criteria)}"
            )

    print()


def _cmd_sync(args: argparse.Namespace) -> None:
    """Sync agent and journey fixtures to Langfuse datasets, reporting each outcome.

    When ``--fixture`` is given, only fixtures with that name (of either
    kind) are synced. A fixture is reported as skipped when its sync
    function returns a falsy dataset name.
    """
    fixtures_dir = _fixtures_dir(args.fixtures_dir)
    fixtures = discover_fixtures(fixtures_dir)
    journey_fixtures = discover_journey_fixtures(fixtures_dir)

    if args.fixture:
        fixtures = [f for f in fixtures if f.name == args.fixture]
        journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture]

    if not fixtures and not journey_fixtures:
        print("No fixtures to sync.")
        return

    # Agent fixtures first, then journey fixtures, each with its own sync function.
    sync_plan = (
        (fixtures, langfuse_eval.sync_fixture_to_dataset),
        (journey_fixtures, langfuse_eval.sync_journey_fixture_to_dataset),
    )
    for group, sync in sync_plan:
        for fixture in group:
            name = sync(fixture)
            if name:
                print(f"Synced: {fixture.name} → {name}")
            else:
                print(f"Skipped: {fixture.name} (Langfuse not configured)")


async def _cmd_journey(args: argparse.Namespace) -> None:
    """Run the journey fixture evaluations selected by *args* and print the results.

    Prints a message and returns early when no journey fixtures are
    discovered, the requested fixture name does not match any of them, or
    ``--models`` contains no usable model name.
    """
    journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))
    if not journey_fixtures:
        print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.")
        return

    if args.fixture:
        journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture]
        if not journey_fixtures:
            print(f"Journey fixture '{args.fixture}' not found.")
            return

    models = _split_csv(args.models)
    if not models:
        # Without this guard the loop below would run zero evaluations and
        # print an empty result set with no explanation.
        print("No models specified. Pass a comma-separated list via --models.")
        return

    all_results = []
    for fixture in journey_fixtures:
        results = await run_journey_fixture_eval(
            fixture,
            models=models,
            judge_model=args.judge_model,
        )
        all_results.extend(results)

    print_journey_results(all_results)


def main() -> None:
    """Parse the command line, configure logging and dispatch to the sub-command."""
    args = _parse_args()
    _setup_logging(args.verbose)

    # "run" and "journey" are coroutines; "list" and "sync" are synchronous.
    if args.command == "run":
        asyncio.run(_cmd_run(args))
    elif args.command == "journey":
        asyncio.run(_cmd_journey(args))
    elif args.command == "list":
        _cmd_list(args)
    elif args.command == "sync":
        _cmd_sync(args)


if __name__ == "__main__":
    main()