"""CLI entry point for the batch agent evaluation harness. Usage:: # From services/batch-agent/: python -m eval run # all agent fixtures, default model python -m eval run --fixture=classify-invoices # single fixture python -m eval run --models=gpt-4o,gpt-5.3-codex # multiple models python -m eval run --mode=step1 # only step1 fixtures python -m eval run --no-judge # skip LLM judge scoring python -m eval interactive # interactive journey session python -m eval interactive --fixture=journey-invoice-setup python -m eval interactive --model=gpt-4o python -m eval interactive --judge-model=github_copilot/gpt-4o-mini python -m eval list # list all fixtures python -m eval sync # sync fixtures to Langfuse datasets """ from __future__ import annotations import argparse import asyncio import logging import sys from pathlib import Path # Ensure the service root and repo root are in sys.path. # Service root must come BEFORE repo root so its ``app/`` package # shadows the monolith ``app/`` in the repo root. _SERVICE_ROOT = Path(__file__).resolve().parent.parent _REPO_ROOT = _SERVICE_ROOT.parent.parent _sr = str(_SERVICE_ROOT) _rr = str(_REPO_ROOT) if _rr not in sys.path: sys.path.insert(0, _rr) # Always force service root to position 0 (python -m may have already # added CWD further down the list, which loses to repo root). if _sr in sys.path: sys.path.remove(_sr) sys.path.insert(0, _sr) from eval.config import discover_fixtures, discover_journey_fixtures from eval.runner import run_fixture_eval, print_results from eval.interactive import run_interactive from eval import langfuse_eval def _setup_logging(verbose: bool) -> None: level = logging.DEBUG if verbose else logging.INFO logging.basicConfig( level=level, format="%(asctime)s %(name)-20s %(levelname)-5s %(message)s", datefmt="%H:%M:%S", ) # Quiet noisy libraries for name in ("httpx", "httpcore", "openai", "litellm", "urllib3"): logging.getLogger(name).setLevel(logging.WARNING) def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Batch Agent E2E evaluation harness", prog="python -m eval", ) sub = parser.add_subparsers(dest="command", required=True) # ── run ─────────────────────────────────────────────────────── run_cmd = sub.add_parser("run", help="Run evaluations") run_cmd.add_argument( "--fixture", "-f", help="Run only the named fixture (default: all)", ) run_cmd.add_argument( "--models", "-m", default="github_copilot/gpt-5.3-codex", help="Comma-separated list of models to test (default: github_copilot/gpt-5.3-codex)", ) run_cmd.add_argument( "--mode", default=None, choices=["step1", "step2", "full"], help="Only run fixtures with this mode (default: all)", ) run_cmd.add_argument( "--no-judge", action="store_true", help="Skip LLM-as-judge scoring", ) run_cmd.add_argument( "--judge-model", default="gpt-4o", help="Model for LLM judge (default: gpt-4o)", ) run_cmd.add_argument( "--fixtures-dir", default=None, help="Path to fixtures directory (default: eval/fixtures/)", ) run_cmd.add_argument("-v", "--verbose", action="store_true") # ── list ────────────────────────────────────────────────────── list_cmd = sub.add_parser("list", help="List available fixtures") list_cmd.add_argument("--fixtures-dir", default=None) list_cmd.add_argument("-v", "--verbose", action="store_true") # ── sync ────────────────────────────────────────────────────── sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets") sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture") sync_cmd.add_argument("--fixtures-dir", default=None) sync_cmd.add_argument("-v", "--verbose", action="store_true") # ── interactive ─────────────────────────────────────────────── inter_cmd = sub.add_parser("interactive", help="Interactive journey session (human-in-the-loop)") inter_cmd.add_argument( "--fixture", "-f", help="Journey fixture to use (default: pick interactively)", ) inter_cmd.add_argument( "--model", "-m", default="github_copilot/gpt-5.3-codex", help="Model for the journey AI (default: github_copilot/gpt-5.3-codex)", ) inter_cmd.add_argument( "--judge-model", default="gpt-4o", help="Model for LLM judge (default: gpt-4o)", ) inter_cmd.add_argument( "--fixtures-dir", default=None, help="Path to fixtures directory (default: eval/fixtures/)", ) inter_cmd.add_argument( "--data-dir", default=None, help="Override sample data directory (e.g. path to private test files not in git)", ) inter_cmd.add_argument("-v", "--verbose", action="store_true") return parser.parse_args() def _fixtures_dir(arg: str | None) -> Path | None: if arg: return Path(arg) return None async def _cmd_run(args: argparse.Namespace) -> None: fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir)) if not fixtures: print("No fixtures found. Create YAML files in eval/fixtures/.") return if args.fixture: fixtures = [f for f in fixtures if f.name == args.fixture] if not fixtures: print(f"Fixture '{args.fixture}' not found.") return models = [m.strip() for m in args.models.split(",")] all_results = [] for fixture in fixtures: if args.mode and fixture.mode != args.mode: continue results = await run_fixture_eval( fixture, models=models, use_llm_judge=not args.no_judge, judge_model=args.judge_model, ) all_results.extend(results) print_results(all_results) def _cmd_list(args: argparse.Namespace) -> None: fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir)) journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir)) if not fixtures and not journey_fixtures: print("No fixtures found.") return if fixtures: print(f"\n{'[Agent Fixtures]'}") print(f"{'Name':<30} {'Mode':<6} {'Types':<25} {'Expected'}") print("-" * 90) for f in fixtures: types = ", ".join(f.data_types) n_expected = len(f.expected) + len(f.expected_classification) print(f"{f.name:<30} {f.mode:<6} {types:<25} {n_expected}") if journey_fixtures: print(f"\n{'[Journey Fixtures]'}") print(f"{'Name':<30} {'Types':<25} {'Messages':<10} {'Criteria'}") print("-" * 90) for f in journey_fixtures: types = ", ".join(f.data_types) print(f"{f.name:<30} {types:<25} {len(f.user_messages):<10} {len(f.expected_template_criteria)}") print() def _cmd_sync(args: argparse.Namespace) -> None: fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir)) journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir)) if args.fixture: fixtures = [f for f in fixtures if f.name == args.fixture] journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture] if not fixtures and not journey_fixtures: print("No fixtures to sync.") return for fixture in fixtures: name = langfuse_eval.sync_fixture_to_dataset(fixture) if name: print(f"Synced: {fixture.name} → {name}") else: print(f"Skipped: {fixture.name} (Langfuse not configured)") for fixture in journey_fixtures: name = langfuse_eval.sync_journey_fixture_to_dataset(fixture) if name: print(f"Synced: {fixture.name} → {name}") else: print(f"Skipped: {fixture.name} (Langfuse not configured)") async def _cmd_interactive(args: argparse.Namespace) -> None: journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir)) if not journey_fixtures: print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.") return if args.fixture: fixtures = [f for f in journey_fixtures if f.name == args.fixture] if not fixtures: print(f"Journey fixture '{args.fixture}' not found.") return fixture = fixtures[0] elif len(journey_fixtures) == 1: fixture = journey_fixtures[0] else: # Let user pick print("\nAvailable journey fixtures:") for i, f in enumerate(journey_fixtures, 1): print(f" {i}. {f.name} — {f.description[:60]}") print() try: choice = int(input("Pick a fixture number: ").strip()) - 1 fixture = journey_fixtures[choice] except (ValueError, IndexError, EOFError, KeyboardInterrupt): print("Invalid choice.") return await run_interactive( fixture, model=args.model, judge_model=args.judge_model, data_dir=Path(args.data_dir).resolve() if args.data_dir else None, ) def main() -> None: args = _parse_args() _setup_logging(args.verbose) if args.command == "run": asyncio.run(_cmd_run(args)) elif args.command == "interactive": asyncio.run(_cmd_interactive(args)) elif args.command == "list": _cmd_list(args) elif args.command == "sync": _cmd_sync(args) if __name__ == "__main__": main()