# Commit notes:
# - Rewrite eval config with EvalMode (step1, step2, full), replacing prompt_variants
# - Rewrite runner with _run_step1, _run_step2, _run_full dispatch
# - CLI: replace --variants with --mode flag
# - Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
# - Remove old freelance_invoices fixture
# - Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
# - Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
# - Langfuse: post separate classification_precision/recall/f1 scores for full mode
# - Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
# - Langfuse: include step1_results in trace output
# - MockExecutor: mock async_session to bypass DB in full mode
# - Journey fixture: remove user_messages (only interactive test kept)
# File: 286 lines, 10 KiB, Python
"""CLI entry point for the batch agent evaluation harness.
|
|
|
|
Usage::

    # From services/batch-agent/:
    python -m eval run                                   # all agent fixtures, default model
    python -m eval run --fixture=classify-invoices       # single fixture
    python -m eval run --models=gpt-4o,gpt-5.3-codex     # multiple models
    python -m eval run --mode=step1                      # only step1 fixtures
    python -m eval run --no-judge                        # skip LLM judge scoring

    python -m eval interactive                           # interactive journey session
    python -m eval interactive --fixture=journey-invoice-setup
    python -m eval interactive --model=gpt-4o
    python -m eval interactive --judge-model=github_copilot/gpt-4o-mini

    python -m eval list                                  # list all fixtures
    python -m eval sync                                  # sync fixtures to Langfuse datasets
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Put both the service root and the repo root on sys.path. The service root
# has to come BEFORE the repo root so that its ``app/`` package shadows the
# monolith ``app/`` that lives in the repo root.
_SERVICE_ROOT = Path(__file__).resolve().parent.parent
_REPO_ROOT = _SERVICE_ROOT.parent.parent
_sr = str(_SERVICE_ROOT)
_rr = str(_REPO_ROOT)

if _rr not in sys.path:
    sys.path.insert(0, _rr)

# The service root is always forced to position 0: ``python -m`` may already
# have added the CWD further down the list, where it loses to the repo root.
try:
    sys.path.remove(_sr)
except ValueError:
    # Not on the path yet; nothing to move.
    pass
sys.path.insert(0, _sr)
|
|
|
|
from eval.config import discover_fixtures, discover_journey_fixtures
|
|
from eval.runner import run_fixture_eval, print_results
|
|
from eval.interactive import run_interactive
|
|
from eval import langfuse_eval
|
|
|
|
|
|
def _setup_logging(verbose: bool) -> None:
    """Configure logging for the CLI.

    Args:
        verbose: When true the level passed to ``basicConfig`` is DEBUG,
            otherwise INFO.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s %(name)-20s %(levelname)-5s %(message)s",
        datefmt="%H:%M:%S",
    )

    # Noisy third-party libraries are capped at WARNING regardless of
    # the verbosity chosen above.
    noisy_loggers = ("httpx", "httpcore", "openai", "litellm", "urllib3")
    for logger_name in noisy_loggers:
        logging.getLogger(logger_name).setLevel(logging.WARNING)
|
|
|
|
|
|
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the ``python -m eval`` argument parser and parse the command line.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which is what the parameterless call did
            before; an explicit list lets callers and tests drive the parser
            without touching process state.

    Returns:
        The parsed namespace. ``command`` holds the selected sub-command
        (``run``, ``list``, ``sync`` or ``interactive``); the remaining
        attributes depend on that sub-command.
    """
    # Shared defaults, kept in one place so the option default and the
    # help text cannot drift apart.
    default_model = "github_copilot/gpt-5.3-codex"
    default_judge_model = "gpt-4o"

    parser = argparse.ArgumentParser(
        description="Batch Agent E2E evaluation harness",
        prog="python -m eval",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # ── run ───────────────────────────────────────────────────────
    run_cmd = sub.add_parser("run", help="Run evaluations")
    run_cmd.add_argument(
        "--fixture", "-f",
        help="Run only the named fixture (default: all)",
    )
    run_cmd.add_argument(
        "--models", "-m",
        default=default_model,
        help=f"Comma-separated list of models to test (default: {default_model})",
    )
    run_cmd.add_argument(
        "--mode",
        default=None,
        choices=["step1", "step2", "full"],
        help="Only run fixtures with this mode (default: all)",
    )
    run_cmd.add_argument(
        "--no-judge",
        action="store_true",
        help="Skip LLM-as-judge scoring",
    )
    run_cmd.add_argument(
        "--judge-model",
        default=default_judge_model,
        help=f"Model for LLM judge (default: {default_judge_model})",
    )
    run_cmd.add_argument(
        "--fixtures-dir",
        default=None,
        help="Path to fixtures directory (default: eval/fixtures/)",
    )
    run_cmd.add_argument("-v", "--verbose", action="store_true")

    # ── list ──────────────────────────────────────────────────────
    list_cmd = sub.add_parser("list", help="List available fixtures")
    list_cmd.add_argument("--fixtures-dir", default=None)
    list_cmd.add_argument("-v", "--verbose", action="store_true")

    # ── sync ──────────────────────────────────────────────────────
    sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets")
    sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
    sync_cmd.add_argument("--fixtures-dir", default=None)
    sync_cmd.add_argument("-v", "--verbose", action="store_true")

    # ── interactive ───────────────────────────────────────────────
    inter_cmd = sub.add_parser("interactive", help="Interactive journey session (human-in-the-loop)")
    inter_cmd.add_argument(
        "--fixture", "-f",
        help="Journey fixture to use (default: pick interactively)",
    )
    inter_cmd.add_argument(
        "--model", "-m",
        default=default_model,
        help=f"Model for the journey AI (default: {default_model})",
    )
    inter_cmd.add_argument(
        "--judge-model",
        default=default_judge_model,
        help=f"Model for LLM judge (default: {default_judge_model})",
    )
    inter_cmd.add_argument(
        "--fixtures-dir",
        default=None,
        help="Path to fixtures directory (default: eval/fixtures/)",
    )
    inter_cmd.add_argument(
        "--data-dir",
        default=None,
        help="Override sample data directory (e.g. path to private test files not in git)",
    )
    inter_cmd.add_argument("-v", "--verbose", action="store_true")

    return parser.parse_args(argv)
|
|
|
|
|
|
def _fixtures_dir(arg: str | None) -> Path | None:
    """Turn the ``--fixtures-dir`` option value into a ``Path``.

    Args:
        arg: Raw option value; may be ``None`` or empty when not given.

    Returns:
        ``Path(arg)`` for a non-empty value, otherwise ``None``.
    """
    return Path(arg) if arg else None
|
|
|
|
|
|
async def _cmd_run(args: argparse.Namespace) -> None:
    """Run the evaluation for every selected fixture and print the results.

    Fixtures are discovered from ``args.fixtures_dir``, then narrowed by
    ``args.fixture`` (exact name match) and ``args.mode``. Each remaining
    fixture is evaluated against every model in the comma-separated
    ``args.models`` list, and the combined results go to ``print_results``.

    Args:
        args: Parsed namespace of the ``run`` sub-command.
    """
    fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir))
    if not fixtures:
        print("No fixtures found. Create YAML files in eval/fixtures/.")
        return

    if args.fixture:
        fixtures = [f for f in fixtures if f.name == args.fixture]
        if not fixtures:
            print(f"Fixture '{args.fixture}' not found.")
            return

    if args.mode:
        fixtures = [f for f in fixtures if f.mode == args.mode]
        if not fixtures:
            # Say why nothing ran instead of silently printing empty results.
            print(f"No fixtures match mode '{args.mode}'.")
            return

    # Drop empty entries so that a stray comma ("a,b," or "a,,b") does not
    # hand an empty model name to the runner.
    models = [name for name in (m.strip() for m in args.models.split(",")) if name]
    if not models:
        print("No models specified.")
        return

    all_results = []
    for fixture in fixtures:
        results = await run_fixture_eval(
            fixture,
            models=models,
            use_llm_judge=not args.no_judge,
            judge_model=args.judge_model,
        )
        all_results.extend(results)

    print_results(all_results)
|
|
|
|
|
|
def _cmd_list(args: argparse.Namespace) -> None:
    """Print a table of the agent fixtures and the journey fixtures.

    Args:
        args: Parsed namespace of the ``list`` sub-command; only
            ``fixtures_dir`` is read.
    """
    fixtures_dir = _fixtures_dir(args.fixtures_dir)
    fixtures = discover_fixtures(fixtures_dir)
    journey_fixtures = discover_journey_fixtures(fixtures_dir)

    if not fixtures and not journey_fixtures:
        print("No fixtures found.")
        return

    if fixtures:
        print("\n[Agent Fixtures]")
        print(f"{'Name':<30} {'Mode':<6} {'Types':<25} Expected")
        print("-" * 90)
        for fixture in fixtures:
            types = ", ".join(fixture.data_types)
            # Counts both kinds of expectation a fixture may carry.
            n_expected = len(fixture.expected) + len(fixture.expected_classification)
            print(f"{fixture.name:<30} {fixture.mode:<6} {types:<25} {n_expected}")

    if journey_fixtures:
        print("\n[Journey Fixtures]")
        print(f"{'Name':<30} {'Types':<25} {'Messages':<10} Criteria")
        print("-" * 90)
        for fixture in journey_fixtures:
            types = ", ".join(fixture.data_types)
            # NOTE(review): journey fixtures may no longer carry
            # ``user_messages``; fall back to 0 instead of raising.
            n_messages = len(getattr(fixture, "user_messages", None) or [])
            n_criteria = len(fixture.expected_template_criteria)
            print(f"{fixture.name:<30} {types:<25} {n_messages:<10} {n_criteria}")

    print()
|
|
|
|
|
|
def _cmd_sync(args: argparse.Namespace) -> None:
    """Sync agent and journey fixtures to Langfuse datasets, one line each.

    Both fixture kinds are discovered from ``args.fixtures_dir``. When
    ``args.fixture`` is set, only fixtures with that exact name are kept.
    For each fixture the outcome is printed: ``Synced`` when the sync call
    returns a truthy dataset name, ``Skipped`` otherwise.

    Args:
        args: Parsed namespace of the ``sync`` sub-command.
    """
    fixtures_dir = _fixtures_dir(args.fixtures_dir)
    agent_fixtures = discover_fixtures(fixtures_dir)
    journey_fixtures = discover_journey_fixtures(fixtures_dir)

    wanted = args.fixture
    if wanted:
        agent_fixtures = [fx for fx in agent_fixtures if fx.name == wanted]
        journey_fixtures = [fx for fx in journey_fixtures if fx.name == wanted]

    if not (agent_fixtures or journey_fixtures):
        print("No fixtures to sync.")
        return

    def report(fx, dataset_name) -> None:
        # A falsy dataset name is reported as "Langfuse not configured".
        if dataset_name:
            print(f"Synced: {fx.name} → {dataset_name}")
        else:
            print(f"Skipped: {fx.name} (Langfuse not configured)")

    for fx in agent_fixtures:
        report(fx, langfuse_eval.sync_fixture_to_dataset(fx))

    for fx in journey_fixtures:
        report(fx, langfuse_eval.sync_journey_fixture_to_dataset(fx))
|
|
|
|
|
|
async def _cmd_interactive(args: argparse.Namespace) -> None:
    """Select a journey fixture and start an interactive session with it.

    Selection order: the fixture named by ``args.fixture``; otherwise the
    only fixture when exactly one exists; otherwise the user picks one from
    a numbered menu. Any invalid selection prints a message and returns
    without starting a session.

    Args:
        args: Parsed namespace of the ``interactive`` sub-command.
    """
    journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))
    if not journey_fixtures:
        print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.")
        return

    if args.fixture:
        fixture = next((f for f in journey_fixtures if f.name == args.fixture), None)
        if fixture is None:
            print(f"Journey fixture '{args.fixture}' not found.")
            return
    elif len(journey_fixtures) == 1:
        fixture = journey_fixtures[0]
    else:
        # Let user pick
        print("\nAvailable journey fixtures:")
        for i, f in enumerate(journey_fixtures, 1):
            print(f" {i}. {f.name} — {f.description[:60]}")
        print()
        try:
            choice = int(input("Pick a fixture number: ").strip())
        except (ValueError, EOFError, KeyboardInterrupt):
            print("Invalid choice.")
            return
        # Validate the 1-based menu number explicitly: without this, "0" or
        # a negative number becomes a negative index and silently picks a
        # fixture from the end of the list.
        if not 1 <= choice <= len(journey_fixtures):
            print("Invalid choice.")
            return
        fixture = journey_fixtures[choice - 1]

    await run_interactive(
        fixture,
        model=args.model,
        judge_model=args.judge_model,
        data_dir=Path(args.data_dir).resolve() if args.data_dir else None,
    )
|
|
|
|
|
|
def main() -> None:
    """Parse the command line, set up logging, and run the chosen sub-command."""
    args = _parse_args()
    _setup_logging(args.verbose)

    # ``run`` and ``interactive`` are coroutines and are driven by
    # ``asyncio.run``; ``list`` and ``sync`` are called directly.
    async_handlers = {"run": _cmd_run, "interactive": _cmd_interactive}
    sync_handlers = {"list": _cmd_list, "sync": _cmd_sync}

    command = args.command
    if command in async_handlers:
        asyncio.run(async_handlers[command](args))
    elif command in sync_handlers:
        sync_handlers[command](args)


if __name__ == "__main__":
    main()
|