refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions
--- a/services/batch-agent/eval/runner.py
+++ b/services/batch-agent/eval/runner.py
@@ -1,28 +1,31 @@
 """Eval runner — orchestrates fixture → mock → agent pipeline → scoring.

-For each (fixture × model × prompt_variant) combination:
-1. Build a MockExecutor with fixture data
-2. Patch execute_on_client
-3. Override LLM_MODEL in shared settings
-4. Run the batch agent pipeline (run_local_agent)
-5. Collect mutations from the mock
-6. Score against expected results (field match + optional LLM judge)
-7. Report scores to Langfuse
-8. Print results
+Supports three eval modes:
+
+- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
+  Calls the LLM with fixture-provided ``domain_definitions`` and
+  ``projects_list`` and compares output against ``expected_classification``.
+
+- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
+  Compiles the prompt with fixture-provided ``existing_context``,
+  ``project_context``, ``data_types``, and ``custom_prompt_section``,
+  then runs the tool-calling loop.  Mutations are scored against
+  ``expected`` records.
+
+- **full**: Run ``run_local_agent()`` end-to-end (both steps).
+  Scored on both classification and extraction.
 """

 from __future__ import annotations

-import asyncio
 import copy
 import json
 import logging
 import time
 import uuid
-from pathlib import Path
 from typing import Any

-from eval.config import EvalFixture, ExpectedRecord
+from eval.config import EvalFixture, ExpectedClassification
 from eval.mock_executor import MockExecutor
 from eval.scorer import (
    EvalScores,
@@ -36,72 +39,193 @@ from eval import langfuse_eval
 logger = logging.getLogger(__name__)


-async def run_single_eval(
+# ── Step 1 runner ─────────────────────────────────────────────────────────
+
+
+async def _run_step1(
    fixture: EvalFixture,
    model: str,
-    prompt_variant: str,
-    *,
-    use_llm_judge: bool = True,
-    judge_model: str = "gpt-4o-mini",
-) -> EvalScores:
-    """Execute one (fixture × model × prompt_variant) eval and return scores."""
-    from shared.config import settings
+    mock: MockExecutor,
+) -> list[dict[str, Any]]:
+    """Run step-1 classification for each expected file.

-    prompt_template = fixture.prompt_variants.get(prompt_variant, "")
+    Returns a list of result dicts:
+    ``[{file, project_id, domains, new_project_name}, ...]``
+    """
+    from app.agent_runner import _classify_file

-    # Build mock executor
-    seed = copy.deepcopy(fixture.seed_records)
-    mock = MockExecutor(
-        fixture_dir=fixture.fixture_dir,
-        seed_records=seed,
+    results: list[dict[str, Any]] = []
+    for ec in fixture.expected_classification:
+        # Read the file content through the mock
+        file_result = await mock._handle(
+            action="read_file_content",
+            data={"path": ec.file},
+        )
+        file_content: str = file_result.get("content", "")
+
+        project_id, domains, new_name = await _classify_file(
+            file_path=ec.file,
+            file_content=file_content,
+            projects=fixture.projects_list,
+            config_data_types=fixture.data_types,
+        )
+        results.append({
+            "file": ec.file,
+            "project_id": project_id,
+            "domains": domains,
+            "new_project_name": new_name,
+        })
+    return results
+
+
+def _score_step1(
+    fixture: EvalFixture,
+    results: list[dict[str, Any]],
+) -> tuple[float, float, float, str]:
+    """Score step-1 results. Returns (precision, recall, f1, reasoning)."""
+    if not fixture.expected_classification:
+        return 0.0, 0.0, 0.0, "No expected classifications"
+
+    total = len(fixture.expected_classification)
+    matched = 0
+    details: list[str] = []
+
+    for ec in fixture.expected_classification:
+        actual = next((r for r in results if r["file"] == ec.file), None)
+        if actual is None:
+            details.append(f"  MISS {ec.file}: not processed")
+            continue
+
+        pid_ok = actual["project_id"] == ec.project_id
+        domains_ok = set(actual["domains"]) == set(ec.domains) if ec.domains else True
+
+        if pid_ok and domains_ok:
+            matched += 1
+            details.append(f"  OK   {ec.file}: project={actual['project_id']}, domains={actual['domains']}")
+        else:
+            parts: list[str] = []
+            if not pid_ok:
+                parts.append(f"project expected={ec.project_id} got={actual['project_id']}")
+            if not domains_ok:
+                parts.append(f"domains expected={ec.domains} got={actual['domains']}")
+            details.append(f"  FAIL {ec.file}: {'; '.join(parts)}")
+
+    precision = matched / total if total > 0 else 0.0
+    recall = precision  # in step1, precision == recall (same denominator)
+    f1 = precision  # same
+    reasoning = "\n".join(details)
+    return precision, recall, f1, reasoning
+
+
+# ── Step 2 runner ─────────────────────────────────────────────────────────
+
+
+async def _run_step2(
+    fixture: EvalFixture,
+    model: str,
+    mock: MockExecutor,
+) -> None:
+    """Run step-2 processing for each file in the fixture directory.
+
+    Compiles ``_PROCESSING_SYSTEM_PROMPT`` with fixture-provided variables
+    and runs the tool-calling loop.  Mutations are captured by the mock.
+    """
+    from app.agent_runner import (
+        _PROCESSING_SYSTEM_PROMPT,
+        _build_processing_tools,
+        _run_agent_with_tools,
+        _MAX_PROCESSING_STEPS,
+    )
+    from app import tracing
+
+    # Compile the processing prompt with fixture variables
+    system_prompt = tracing.compile_prompt(
+        "batch_processing",
+        fallback=_PROCESSING_SYSTEM_PROMPT,
+        variables={
+            "existing_context": fixture.existing_context,
+            "project_context": fixture.project_context,
+            "data_types": ", ".join(fixture.data_types),
+            "custom_prompt_section": fixture.custom_prompt_section,
+        },
    )

-    # Override the LLM model for this run
-    original_model = settings.LLM_MODEL
-    settings.LLM_MODEL = model
+    tools = _build_processing_tools(fixture.data_types)
+
+    # Scan files in the fixture directory
+    file_entries = await mock._handle(
+        action="list_directory",
+        data={"path": fixture.directory},
+    )
+    for entry in file_entries.get("entries", []):
+        if entry.get("type") != "file":
+            continue
+        # Filter by extension if specified
+        if fixture.file_extensions:
+            ext = entry["name"].rsplit(".", 1)[-1] if "." in entry["name"] else ""
+            if ext not in fixture.file_extensions:
+                continue
+
+        file_result = await mock._handle(
+            action="read_file_content",
+            data={"path": entry["path"]},
+        )
+        file_content: str = file_result.get("content", "")
+        if not file_content.strip():
+            continue
+
+        await _run_agent_with_tools(
+            system_prompt=system_prompt,
+            user_message=(
+                f"Process this file and extract relevant information.\n\n"
+                f"File: {entry['path']}\n\nContent:\n{file_content}"
+            ),
+            tools=tools,
+            max_steps=_MAX_PROCESSING_STEPS,
+        )
+
+
+# ── Full runner ───────────────────────────────────────────────────────────
+
+
+async def _run_full(
+    fixture: EvalFixture,
+    model: str,
+    mock: MockExecutor,
+    user_id: str,
+) -> None:
+    """Run the full two-step pipeline via ``run_local_agent``."""
+    from app.agent_runner import run_local_agent

-    # Build trigger data (same shape as what redis_consumer delivers)
    trigger_data: dict[str, Any] = {
        "type": "agent_trigger",
        "directory": fixture.directory,
        "directory_paths": [fixture.directory],
        "data_types": fixture.data_types,
        "file_extensions": fixture.file_extensions,
-        "prompt_template": prompt_template,
+        "prompt_template": fixture.custom_prompt_section,
        "device_id": "eval-harness",
        "run_context": {
-            "agent_id": f"eval-{fixture.name}-{prompt_variant}",
-            "run_id": None,  # skip DB logging during eval
+            "agent_id": f"eval-{fixture.name}",
+            "run_id": None,
        },
    }

-    eval_user_id = f"eval-{uuid.uuid4().hex[:8]}"
+    with mock.patch():
+        await run_local_agent(user_id, trigger_data)

-    logger.info(
-        "eval: starting %s | model=%s | variant=%s",
-        fixture.name, model, prompt_variant,
-    )
-    start_time = time.time()

-    try:
-        # Patch execute_on_client + set user context, then run the pipeline
-        from app.ws_context import set_current_user, clear_current_user
-        from app.agent_runner import run_local_agent
+# ── Scoring helpers ───────────────────────────────────────────────────────

-        set_current_user(eval_user_id)
-        with mock.patch():
-            await run_local_agent(eval_user_id, trigger_data)
-    except Exception as exc:
-        logger.error("eval: pipeline failed for %s/%s/%s: %s", fixture.name, model, prompt_variant, exc)
-    finally:
-        settings.LLM_MODEL = original_model
-        from app.ws_context import clear_current_user
-        clear_current_user()

-    elapsed = time.time() - start_time
-    logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
+def _score_mutations(
+    fixture: EvalFixture,
+    mock: MockExecutor,
+) -> tuple[list[FieldScore], float, float, float, int, int]:
+    """Score mutations against expected records.

-    # ── Score results ────────────────────────────────────────────
+    Returns (field_scores, precision, recall, f1, extra, missing).
+    """
    all_field_scores: list[FieldScore] = []
    total_expected = 0
    total_actual = 0
@@ -109,12 +233,10 @@ async def run_single_eval(
    total_extra = 0
    total_missing = 0

-    # Group expected by table
    expected_by_table: dict[str, list[dict]] = {}
    for rec in fixture.expected:
        expected_by_table.setdefault(rec.table, []).append(rec.fields)

-    # Compare against actual mutations (inserts + updates)
    tables = set(expected_by_table.keys()) | {m.table for m in mock.mutations}
    for table in tables:
        expected_records = expected_by_table.get(table, [])
@@ -131,49 +253,160 @@ async def run_single_eval(
        total_missing += missing

    precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
+    return all_field_scores, precision, recall, f1, total_extra, total_missing

-    scores = EvalScores(
-        fixture_name=fixture.name,
-        model=model,
-        prompt_variant=prompt_variant,
-        field_scores=all_field_scores,
-        precision=precision,
-        recall=recall,
-        f1=f1,
-        extra_records=total_extra,
-        missing_records=total_missing,
+
+# ── Main entry point ──────────────────────────────────────────────────────
+
+
+async def run_single_eval(
+    fixture: EvalFixture,
+    model: str,
+    *,
+    use_llm_judge: bool = True,
+    judge_model: str = "gpt-4o-mini",
+) -> EvalScores:
+    """Execute one eval run for a fixture + model.  Mode is read from the fixture."""
+    from shared.config import settings
+    from shared.ws_context import set_current_user, clear_current_user
+
+    seed = copy.deepcopy(fixture.seed_records)
+    mock = MockExecutor(
+        fixture_dir=fixture.fixture_path.parent,
+        seed_records=seed,
    )

-    # ── Optional LLM judge ───────────────────────────────────────
-    if use_llm_judge and fixture.expected:
-        all_expected = [r.fields for r in fixture.expected]
-        all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
-        judge_score, reasoning = await llm_judge_score(
-            all_expected, all_actual, judge_model=judge_model,
-        )
-        scores.llm_judge_score = judge_score
-        scores.llm_judge_reasoning = reasoning
+    original_model = settings.LLM_MODEL
+    settings.LLM_MODEL = model
+    eval_user_id = str(uuid.uuid4())

-    # ── Report to Langfuse ───────────────────────────────────────
-    dataset_name = f"batch-eval-{fixture.name}"
-    dataset_item_id = f"{fixture.name}--{prompt_variant}"
-    run_name = f"{model}--{prompt_variant}--{int(time.time())}"
+    logger.info(
+        "eval: starting %s | mode=%s | model=%s",
+        fixture.name, fixture.mode, model,
+    )
+    start_time = time.time()
+
+    step1_results: list[dict[str, Any]] = []
+    step1_reasoning = ""
+
+    try:
+        set_current_user(eval_user_id)
+
+        if fixture.mode == "step1":
+            with mock.patch():
+                step1_results = await _run_step1(fixture, model, mock)
+
+        elif fixture.mode == "step2":
+            with mock.patch():
+                await _run_step2(fixture, model, mock)
+
+        elif fixture.mode == "full":
+            with mock.patch():
+                # Step 1 — classification (independent from run_local_agent)
+                if fixture.expected_classification:
+                    step1_results = await _run_step1(fixture, model, mock)
+
+            # Full pipeline — run_local_agent performs both steps itself; _run_full applies its own mock.patch()
+            await _run_full(fixture, model, mock, eval_user_id)
+
+    except Exception as exc:
+        logger.error("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
+    finally:
+        settings.LLM_MODEL = original_model
+        clear_current_user()
+
+    elapsed = time.time() - start_time
+    logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
+
+    # ── Score ─────────────────────────────────────────────────────
+
+    if fixture.mode == "step1":
+        s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
+        scores = EvalScores(
+            fixture_name=fixture.name,
+            model=model,
+            prompt_variant=fixture.mode,
+            precision=s1_precision,
+            recall=s1_recall,
+            f1=s1_f1,
+            llm_judge_reasoning=step1_reasoning,
+        )
+    else:
+        # step2 or full — score mutations
+        field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
+        scores = EvalScores(
+            fixture_name=fixture.name,
+            model=model,
+            prompt_variant=fixture.mode,
+            field_scores=field_scores,
+            precision=precision,
+            recall=recall,
+            f1=f1,
+            extra_records=extra,
+            missing_records=missing,
+        )
+
+        # Add step1 classification scores for full mode
+        if fixture.mode == "full" and fixture.expected_classification:
+            s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
+            scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"
+
+        # Optional LLM judge for extraction quality
+        if use_llm_judge and fixture.expected:
+            all_expected = [r.fields for r in fixture.expected]
+            all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
+            judge_score, reasoning = await llm_judge_score(
+                all_expected, all_actual, judge_model=judge_model,
+            )
+            scores.llm_judge_score = judge_score
+            if step1_reasoning:
+                scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
+            else:
+                scores.llm_judge_reasoning = reasoning
+
+    # ── Report to Langfuse ────────────────────────────────────────
+    prompt_names = {
+        "step1": ["batch_file_classifier"],
+        "step2": ["batch_processing"],
+        "full": ["batch_file_classifier", "batch_processing"],
+    }.get(fixture.mode, ["batch_processing"])

    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
-        prompt_variant=prompt_variant,
-        prompt_template=prompt_template,
+        prompt_variant=fixture.mode,
+        prompt_template=fixture.custom_prompt_section or "(default)",
        actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
        scores_summary=scores.summary(),
-        dataset_name=dataset_name,
-        run_name=run_name,
-        dataset_item_id=dataset_item_id,
+        step1_results=step1_results or None,
+        langfuse_prompt_names=prompt_names,
    )

    if trace_id:
        langfuse_eval.post_eval_scores(scores, trace_id=trace_id)

+        # For full mode, post classification scores separately
+        if fixture.mode == "full" and fixture.expected_classification:
+            s1_p, s1_r, s1_f1, _ = _score_step1(fixture, step1_results)
+            for name, value in [
+                ("classification_precision", s1_p),
+                ("classification_recall", s1_r),
+                ("classification_f1", s1_f1),
+            ]:
+                try:
+                    from langfuse import get_client
+                    lf = get_client()
+                    if lf:
+                        lf.create_score(
+                            name=name,
+                            value=value,
+                            trace_id=trace_id,
+                            data_type="NUMERIC",
+                            comment=f"{fixture.name} | {model} | full",
+                        )
+                except Exception:
+                    pass
+
    return scores


@@ -181,29 +414,20 @@ async def run_fixture_eval(
    fixture: EvalFixture,
    models: list[str],
    *,
-    variants: list[str] | None = None,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
 ) -> list[EvalScores]:
-    """Run all (model × variant) combinations for a fixture."""
-    if variants is None:
-        variants = list(fixture.prompt_variants.keys())
-
-    # Sync fixture to Langfuse dataset
+    """Run all models for a fixture."""
    langfuse_eval.sync_fixture_to_dataset(fixture)

    results: list[EvalScores] = []
    for model in models:
-        for variant in variants:
-            if variant not in fixture.prompt_variants:
-                logger.warning("eval: variant %r not found in fixture %s", variant, fixture.name)
-                continue
-            scores = await run_single_eval(
-                fixture, model, variant,
-                use_llm_judge=use_llm_judge,
-                judge_model=judge_model,
-            )
-            results.append(scores)
+        scores = await run_single_eval(
+            fixture, model,
+            use_llm_judge=use_llm_judge,
+            judge_model=judge_model,
+        )
+        results.append(scores)

    return results

@@ -214,18 +438,21 @@ def print_results(results: list[EvalScores]) -> None:
        print("\nNo eval results.")
        return

-    print("\n" + "=" * 90)
-    print(f"{'Fixture':<25} {'Model':<25} {'Variant':<15} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
-    print("-" * 90)
+    print("\n" + "=" * 95)
+    print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
+    print("-" * 95)

    for s in results:
        llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else "  --"
        print(
-            f"{s.fixture_name:<25} {s.model:<25} {s.prompt_variant:<15} "
+            f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} "
            f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} "
            f"{s.field_accuracy:>6.2f} {llm_str:>6}"
        )

+    print("=" * 95)
+    print()
+
    print("=" * 90)

    # If LLM judge reasoning is available, print it