refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions
--- a/services/batch-agent/eval/mock_executor.py
+++ b/services/batch-agent/eval/mock_executor.py
@@ -1,6 +1,6 @@
 """Mock executor — intercepts execute_on_client for offline E2E testing.

-Patches ``app.ws_context.execute_on_client`` so agent pipeline runs don't
+Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't
 require a live Electron client or Redis.  Instead:

 - **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
@@ -20,6 +20,7 @@ import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
+from contextlib import contextmanager, asynccontextmanager
 from unittest.mock import AsyncMock, patch


@@ -33,6 +34,30 @@ class Mutation:
    timestamp: float = field(default_factory=time.time)


+# ── Fake DB helpers (used to bypass async_session in full mode) ───────
+
+class _FakeRow:
+    """Mimics an AgentRunLog row returned by SQLAlchemy."""
+    id = 0
+    status = "running"
+    items_processed = 0
+    items_created = 0
+    errors: list[str] = []
+    completed_at = None
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        object.__setattr__(self, name, value)
+
+
+class _FakeResult:
+    """Mimics a SQLAlchemy ``Result`` with ``scalar_one_or_none``."""
+    def __init__(self, row: _FakeRow) -> None:
+        self._row = row
+
+    def scalar_one_or_none(self) -> _FakeRow:
+        return self._row
+
+
@dataclass
 class MockExecutor:
    """In-memory executor that replaces Redis-based tool round-trip.
@@ -77,12 +102,37 @@ class MockExecutor:

    # ── Context manager for patching ──────────────────────────────

+    @contextmanager
    def patch(self):
-        """Return an async context-manager that patches execute_on_client."""
-        return patch(
-            "app.ws_context.execute_on_client",
-            new=AsyncMock(side_effect=self._handle),
-        )
+        """Patch execute_on_client and DB session at all usage sites."""
+        mock_fn = AsyncMock(side_effect=self._handle)
+        targets = [
+            "shared.ws_context.execute_on_client",
+            "app.agent_runner.execute_on_client",
+            "app.agents.filesystem_agent.execute_on_client",
+        ]
+
+        # Mock async_session so run_local_agent / _finalize_run skip real DB
+        fake_row = _FakeRow()
+        fake_db = AsyncMock()
+        fake_db.commit = AsyncMock()
+        fake_db.refresh = AsyncMock()
+        fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row))
+        fake_db.add = lambda obj: None  # noqa: ARG005
+
+        @asynccontextmanager
+        async def _fake_session():
+            yield fake_db
+
+        patches = [patch(t, new=mock_fn) for t in targets]
+        patches.append(patch("app.agent_runner.async_session", _fake_session))
+        for p in patches:
+            p.start()
+        try:
+            yield mock_fn
+        finally:
+            for p in patches:
+                p.stop()

    # ── Internal dispatch ─────────────────────────────────────────