refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full), replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions
--- a/services/batch-agent/eval/cli.py
+++ b/services/batch-agent/eval/cli.py
@@ -4,14 +4,15 @@ Usage::

    # From services/batch-agent/:
    python -m eval run                                # all agent fixtures, default model
-    python -m eval run --fixture=freelance-invoices   # single fixture
-    python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4
-    python -m eval run --variants=baseline,detailed   # specific prompt variants
+    python -m eval run --fixture=classify-invoices    # single fixture
+    python -m eval run --models=gpt-4o,gpt-5.3-codex  # multiple models
+    python -m eval run --mode=step1                   # only step1 fixtures
    python -m eval run --no-judge                     # skip LLM judge scoring

-    python -m eval journey                            # all journey fixtures
-    python -m eval journey --fixture=journey-invoices # single journey fixture
-    python -m eval journey --models=gpt-4o,anthropic/claude-sonnet-4
+    python -m eval interactive                        # interactive journey session
+    python -m eval interactive --fixture=journey-invoice-setup
+    python -m eval interactive --model=gpt-4o
+    python -m eval interactive --judge-model=github_copilot/gpt-4o-mini

    python -m eval list                               # list all fixtures
    python -m eval sync                               # sync fixtures to Langfuse datasets
@@ -25,16 +26,24 @@ import logging
 import sys
 from pathlib import Path

-# Ensure the service root and repo root are in sys.path
+# Ensure the service root and repo root are in sys.path.
+# Service root must come BEFORE repo root so its ``app/`` package
+# shadows the monolith ``app/`` in the repo root.
 _SERVICE_ROOT = Path(__file__).resolve().parent.parent
 _REPO_ROOT = _SERVICE_ROOT.parent.parent
-for p in (_SERVICE_ROOT, _REPO_ROOT):
-    if str(p) not in sys.path:
-        sys.path.insert(0, str(p))
+_sr = str(_SERVICE_ROOT)
+_rr = str(_REPO_ROOT)
+if _rr not in sys.path:
+    sys.path.insert(0, _rr)
+# Always force service root to position 0 (python -m may have already
+# added CWD further down the list, which loses to repo root).
+if _sr in sys.path:
+    sys.path.remove(_sr)
+sys.path.insert(0, _sr)

 from eval.config import discover_fixtures, discover_journey_fixtures
 from eval.runner import run_fixture_eval, print_results
-from eval.journey_runner import run_journey_fixture_eval, print_journey_results
+from eval.interactive import run_interactive
 from eval import langfuse_eval


@@ -65,13 +74,14 @@ def _parse_args() -> argparse.Namespace:
    )
    run_cmd.add_argument(
        "--models", "-m",
-        default="gpt-4o",
-        help="Comma-separated list of models to test (default: gpt-4o)",
+        default="github_copilot/gpt-5.3-codex",
+        help="Comma-separated list of models to test (default: github_copilot/gpt-5.3-codex)",
    )
    run_cmd.add_argument(
-        "--variants", "-p",
+        "--mode",
        default=None,
-        help="Comma-separated prompt variants to test (default: all in fixture)",
+        choices=["step1", "step2", "full"],
+        help="Only run fixtures with this mode (default: all)",
    )
    run_cmd.add_argument(
        "--no-judge",
@@ -80,8 +90,8 @@ def _parse_args() -> argparse.Namespace:
    )
    run_cmd.add_argument(
        "--judge-model",
-        default="gpt-4o-mini",
-        help="Model for LLM judge (default: gpt-4o-mini)",
+        default="gpt-4o",
+        help="Model for LLM judge (default: gpt-4o)",
    )
    run_cmd.add_argument(
        "--fixtures-dir",
@@ -95,35 +105,40 @@ def _parse_args() -> argparse.Namespace:
    list_cmd.add_argument("--fixtures-dir", default=None)
    list_cmd.add_argument("-v", "--verbose", action="store_true")

-    # ── journey ───────────────────────────────────────────────────
-    journey_cmd = sub.add_parser("journey", help="Run journey evaluations")
-    journey_cmd.add_argument(
-        "--fixture", "-f",
-        help="Run only the named journey fixture (default: all)",
-    )
-    journey_cmd.add_argument(
-        "--models", "-m",
-        default="gpt-4o",
-        help="Comma-separated list of models to test (default: gpt-4o)",
-    )
-    journey_cmd.add_argument(
-        "--judge-model",
-        default="gpt-4o-mini",
-        help="Model for LLM judge (default: gpt-4o-mini)",
-    )
-    journey_cmd.add_argument(
-        "--fixtures-dir",
-        default=None,
-        help="Path to fixtures directory (default: eval/fixtures/)",
-    )
-    journey_cmd.add_argument("-v", "--verbose", action="store_true")
-
    # ── sync ──────────────────────────────────────────────────────
    sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets")
    sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
    sync_cmd.add_argument("--fixtures-dir", default=None)
    sync_cmd.add_argument("-v", "--verbose", action="store_true")

+    # ── interactive ───────────────────────────────────────────────
+    inter_cmd = sub.add_parser("interactive", help="Interactive journey session (human-in-the-loop)")
+    inter_cmd.add_argument(
+        "--fixture", "-f",
+        help="Journey fixture to use (default: pick interactively)",
+    )
+    inter_cmd.add_argument(
+        "--model", "-m",
+        default="github_copilot/gpt-5.3-codex",
+        help="Model for the journey AI (default: github_copilot/gpt-5.3-codex)",
+    )
+    inter_cmd.add_argument(
+        "--judge-model",
+        default="gpt-4o",
+        help="Model for LLM judge (default: gpt-4o)",
+    )
+    inter_cmd.add_argument(
+        "--fixtures-dir",
+        default=None,
+        help="Path to fixtures directory (default: eval/fixtures/)",
+    )
+    inter_cmd.add_argument(
+        "--data-dir",
+        default=None,
+        help="Override sample data directory (e.g. path to private test files not in git)",
+    )
+    inter_cmd.add_argument("-v", "--verbose", action="store_true")
+
    return parser.parse_args()


@@ -146,14 +161,14 @@ async def _cmd_run(args: argparse.Namespace) -> None:
            return

    models = [m.strip() for m in args.models.split(",")]
-    variants = [v.strip() for v in args.variants.split(",")] if args.variants else None

    all_results = []
    for fixture in fixtures:
+        if args.mode and fixture.mode != args.mode:
+            continue
        results = await run_fixture_eval(
            fixture,
            models=models,
-            variants=variants,
            use_llm_judge=not args.no_judge,
            judge_model=args.judge_model,
        )
@@ -172,12 +187,12 @@ def _cmd_list(args: argparse.Namespace) -> None:

    if fixtures:
        print(f"\n{'[Agent Fixtures]'}")
-        print(f"{'Name':<30} {'Types':<25} {'Variants':<20} {'Expected'}")
+        print(f"{'Name':<30} {'Mode':<6} {'Types':<25} {'Expected'}")
        print("-" * 90)
        for f in fixtures:
-            variants = ", ".join(f.prompt_variants.keys())
            types = ", ".join(f.data_types)
-            print(f"{f.name:<30} {types:<25} {variants:<20} {len(f.expected)}")
+            n_expected = len(f.expected) + len(f.expected_classification)
+            print(f"{f.name:<30} {f.mode:<6} {types:<25} {n_expected}")

    if journey_fixtures:
        print(f"\n{'[Journey Fixtures]'}")
@@ -217,30 +232,39 @@ def _cmd_sync(args: argparse.Namespace) -> None:
            print(f"Skipped: {fixture.name} (Langfuse not configured)")


-async def _cmd_journey(args: argparse.Namespace) -> None:
+async def _cmd_interactive(args: argparse.Namespace) -> None:
    journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))
    if not journey_fixtures:
        print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.")
        return

    if args.fixture:
-        journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture]
-        if not journey_fixtures:
+        fixtures = [f for f in journey_fixtures if f.name == args.fixture]
+        if not fixtures:
            print(f"Journey fixture '{args.fixture}' not found.")
            return
+        fixture = fixtures[0]
+    elif len(journey_fixtures) == 1:
+        fixture = journey_fixtures[0]
+    else:
+        # Let user pick
+        print("\nAvailable journey fixtures:")
+        for i, f in enumerate(journey_fixtures, 1):
+            print(f"  {i}. {f.name} — {f.description[:60]}")
+        print()
+        try:
+            choice = int(input("Pick a fixture number: ").strip()) - 1
+            fixture = journey_fixtures[choice]
+        except (ValueError, IndexError, EOFError, KeyboardInterrupt):
+            print("Invalid choice.")
+            return

-    models = [m.strip() for m in args.models.split(",")]
-
-    all_results = []
-    for fixture in journey_fixtures:
-        results = await run_journey_fixture_eval(
-            fixture,
-            models=models,
-            judge_model=args.judge_model,
-        )
-        all_results.extend(results)
-
-    print_journey_results(all_results)
+    await run_interactive(
+        fixture,
+        model=args.model,
+        judge_model=args.judge_model,
+        data_dir=Path(args.data_dir).resolve() if args.data_dir else None,
+    )


 def main() -> None:
@@ -249,8 +273,8 @@ def main() -> None:

    if args.command == "run":
        asyncio.run(_cmd_run(args))
-    elif args.command == "journey":
-        asyncio.run(_cmd_journey(args))
+    elif args.command == "interactive":
+        asyncio.run(_cmd_interactive(args))
    elif args.command == "list":
        _cmd_list(args)
    elif args.command == "sync":