refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions
--- a/services/batch-agent/eval/journey_runner.py
+++ b/services/batch-agent/eval/journey_runner.py
@@ -94,7 +94,7 @@ async def _judge_template(

    Returns (criteria_scores, reasoning).
    """
-    from app.llm import get_llm
+    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)

@@ -152,13 +152,23 @@ async def run_single_journey_eval(
    model: str,
    *,
    judge_model: str = "gpt-4o-mini",
+    data_dir: Path | None = None,
 ) -> JourneyEvalResult:
-    """Execute one journey eval: start → messages → score template."""
+    """Execute one journey eval: start \u2192 messages \u2192 score template."""
    from shared.config import settings

-    # Build mock executor for filesystem tools
+    # When data_dir is given, use its parent as MockExecutor root
+    # and its name as the journey directory so the LLM sees a
+    # meaningful path (not ".").
+    if data_dir:
+        mock_root = data_dir.parent
+        journey_directory = data_dir.name
+    else:
+        mock_root = fixture.fixture_path.parent
+        journey_directory = fixture.directory
+
    mock = MockExecutor(
-        fixture_dir=fixture.fixture_dir,
+        fixture_dir=mock_root,
        seed_records={},
    )

@@ -178,7 +188,7 @@ async def run_single_journey_eval(
    done = False

    try:
-        from app.ws_context import set_current_user, clear_current_user
+        from shared.ws_context import set_current_user, clear_current_user
        from app.journey import handle_journey_start, handle_journey_message, _sessions

        set_current_user(eval_user_id)
@@ -186,7 +196,7 @@ async def run_single_journey_eval(
            # ── Start the journey ────────────────────────────────
            start_frame: dict[str, Any] = {
                "agent_type": "local",
-                "directory": fixture.directory,
+                "directory": journey_directory,
                "data_types": fixture.data_types,
                "session_id": f"eval-{uuid.uuid4().hex[:8]}",
            }
@@ -246,7 +256,7 @@ async def run_single_journey_eval(
        logger.error("journey_eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
    finally:
        settings.LLM_MODEL = original_model
-        from app.ws_context import clear_current_user
+        from shared.ws_context import clear_current_user
        clear_current_user()

    elapsed = time.time() - start_time
@@ -297,6 +307,7 @@ async def run_single_journey_eval(
        prompt_template=prompt_template or "(not generated)",
        actual_mutations=[{"conversation": conversation[:20]}],
        scores_summary=result.summary(),
+        langfuse_prompt_names=["journey_system"],
    )

    if trace_id:
@@ -321,6 +332,7 @@ async def run_journey_fixture_eval(
    models: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
+    data_dir: Path | None = None,
 ) -> list[JourneyEvalResult]:
    """Run all models for a journey fixture."""
    langfuse_eval.sync_journey_fixture_to_dataset(fixture)
@@ -329,6 +341,7 @@ async def run_journey_fixture_eval(
    for model in models:
        result = await run_single_journey_eval(
            fixture, model, judge_model=judge_model,
+            data_dir=data_dir,
        )
        results.append(result)