refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes
- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
"""Mock executor — intercepts execute_on_client for offline E2E testing.
|
||||
|
||||
Patches ``app.ws_context.execute_on_client`` so agent pipeline runs don't
|
||||
Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't
|
||||
require a live Electron client or Redis. Instead:
|
||||
|
||||
- **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
|
||||
@@ -20,6 +20,7 @@ import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from contextlib import contextmanager, asynccontextmanager
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
|
||||
@@ -33,6 +34,30 @@ class Mutation:
|
||||
timestamp: float = field(default_factory=time.time)
|
||||
|
||||
|
||||
# ── Fake DB helpers (used to bypass async_session in full mode) ───────
|
||||
|
||||
class _FakeRow:
|
||||
"""Mimics an AgentRunLog row returned by SQLAlchemy."""
|
||||
id = 0
|
||||
status = "running"
|
||||
items_processed = 0
|
||||
items_created = 0
|
||||
errors: list[str] = []
|
||||
completed_at = None
|
||||
|
||||
def __setattr__(self, name: str, value: Any) -> None:
|
||||
object.__setattr__(self, name, value)
|
||||
|
||||
|
||||
class _FakeResult:
|
||||
"""Mimics a SQLAlchemy ``Result`` with ``scalar_one_or_none``."""
|
||||
def __init__(self, row: _FakeRow) -> None:
|
||||
self._row = row
|
||||
|
||||
def scalar_one_or_none(self) -> _FakeRow:
|
||||
return self._row
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockExecutor:
|
||||
"""In-memory executor that replaces Redis-based tool round-trip.
|
||||
@@ -77,12 +102,37 @@ class MockExecutor:
|
||||
|
||||
# ── Context manager for patching ──────────────────────────────
|
||||
|
||||
@contextmanager
def patch(self):
    """Patch execute_on_client and DB session at all usage sites.

    While the context is active, ``execute_on_client`` is replaced at each
    dotted path in ``targets`` by one shared ``AsyncMock`` whose side
    effect is ``self._handle``, and ``app.agent_runner.async_session`` is
    replaced by an async context manager that yields a fake DB session.

    Yields:
        The shared ``AsyncMock`` installed at every ``execute_on_client``
        target.
    """
    mock_fn = AsyncMock(side_effect=self._handle)
    targets = [
        "shared.ws_context.execute_on_client",
        "app.agent_runner.execute_on_client",
        "app.agents.filesystem_agent.execute_on_client",
    ]

    # Mock async_session so run_local_agent / _finalize_run skip real DB
    fake_row = _FakeRow()
    fake_db = AsyncMock()
    fake_db.commit = AsyncMock()
    fake_db.refresh = AsyncMock()
    # Every execute() call resolves to a result wrapping the same fake row.
    fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row))
    # Plain synchronous no-op instead of AsyncMock's default async child;
    # presumably callers invoke ``db.add(obj)`` without awaiting -- confirm.
    fake_db.add = lambda obj: None  # noqa: ARG005

    @asynccontextmanager
    async def _fake_session():
        yield fake_db

    # ``patch`` here is unittest.mock.patch (module global), not this method.
    patches = [patch(t, new=mock_fn) for t in targets]
    patches.append(patch("app.agent_runner.async_session", _fake_session))

    # Start inside the try and remember what actually started, so a failure
    # part-way through (e.g. an unimportable target) still undoes the
    # patches that were already applied.
    started = []
    try:
        for p in patches:
            p.start()
            started.append(p)
        yield mock_fn
    finally:
        for p in reversed(started):
            p.stop()
|
||||
|
||||
# ── Internal dispatch ─────────────────────────────────────────
|
||||
|
||||
|
||||
Reference in New Issue
Block a user