refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
This commit is contained in:
Roberto Musso
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions

View File

@@ -1,6 +1,6 @@
"""Mock executor — intercepts execute_on_client for offline E2E testing.
Patches ``app.ws_context.execute_on_client`` so agent pipeline runs don't
Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't
require a live Electron client or Redis. Instead:
- **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
@@ -20,6 +20,7 @@ import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from contextlib import contextmanager, asynccontextmanager
from unittest.mock import AsyncMock, patch
@@ -33,6 +34,30 @@ class Mutation:
timestamp: float = field(default_factory=time.time)
# ── Fake DB helpers (used to bypass async_session in full mode) ───────
class _FakeRow:
"""Mimics an AgentRunLog row returned by SQLAlchemy."""
id = 0
status = "running"
items_processed = 0
items_created = 0
errors: list[str] = []
completed_at = None
def __setattr__(self, name: str, value: Any) -> None:
object.__setattr__(self, name, value)
class _FakeResult:
"""Mimics a SQLAlchemy ``Result`` with ``scalar_one_or_none``."""
def __init__(self, row: _FakeRow) -> None:
self._row = row
def scalar_one_or_none(self) -> _FakeRow:
return self._row
@dataclass
class MockExecutor:
"""In-memory executor that replaces Redis-based tool round-trip.
@@ -77,12 +102,37 @@ class MockExecutor:
# ── Context manager for patching ──────────────────────────────
@contextmanager
def patch(self):
"""Return an async context-manager that patches execute_on_client."""
return patch(
"app.ws_context.execute_on_client",
new=AsyncMock(side_effect=self._handle),
)
"""Patch execute_on_client and DB session at all usage sites."""
mock_fn = AsyncMock(side_effect=self._handle)
targets = [
"shared.ws_context.execute_on_client",
"app.agent_runner.execute_on_client",
"app.agents.filesystem_agent.execute_on_client",
]
# Mock async_session so run_local_agent / _finalize_run skip real DB
fake_row = _FakeRow()
fake_db = AsyncMock()
fake_db.commit = AsyncMock()
fake_db.refresh = AsyncMock()
fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row))
fake_db.add = lambda obj: None # noqa: ARG005
@asynccontextmanager
async def _fake_session():
yield fake_db
patches = [patch(t, new=mock_fn) for t in targets]
patches.append(patch("app.agent_runner.async_session", _fake_session))
for p in patches:
p.start()
try:
yield mock_fn
finally:
for p in patches:
p.stop()
# ── Internal dispatch ─────────────────────────────────────────