Files
api/services/batch-agent/eval/mock_executor.py
Roberto Musso d3f7099d93 refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes
- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00

259 lines
9.7 KiB
Python

"""Mock executor — intercepts execute_on_client for offline E2E testing.
Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't
require a live Electron client or Redis. Instead:
- **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
are served from local fixture files on disk.
- **Read actions** (select, get) return preseeded records from an in-memory
store provided by the test fixture.
- **Write actions** (insert, update, delete) are captured as *mutations* and
stored for later comparison against expected results.
"""
from __future__ import annotations
import json
import os
import time
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from contextlib import contextmanager, asynccontextmanager
from unittest.mock import AsyncMock, patch
@dataclass
class Mutation:
    """One write operation captured by the mock executor.

    Instances are plain value objects: the executor appends one per
    ``insert`` / ``update`` / ``delete`` call so a test can later compare
    what was written against what it expected.
    """

    # Kind of write that was performed: insert | update | delete
    action: str
    # Name of the table the write was addressed to
    table: str
    # Payload associated with the write
    data: dict[str, Any]
    # Wall-clock time (``time.time()``) at which the object was created,
    # unless the caller supplies an explicit value
    timestamp: float = field(default_factory=time.time)
# ── Fake DB helpers (used to bypass async_session in full mode) ───────
class _FakeRow:
    """Mimics an AgentRunLog row returned by SQLAlchemy.

    Only a handful of attributes with inert defaults are provided; code
    that receives the row may freely assign to them (or to any other
    attribute name) exactly as it would on an ordinary Python object.
    """

    # Immutable defaults can safely live on the class: assigning to them on
    # an instance shadows the class value without touching other instances.
    id = 0
    status = "running"
    items_processed = 0
    items_created = 0
    completed_at = None

    def __init__(self) -> None:
        # ``errors`` is mutable, so it must be created per instance. As a
        # class attribute a single list would be shared by every fake row,
        # and an ``errors.append(...)`` in one run would leak into the next.
        self.errors: list[str] = []
class _FakeResult:
    """Mimics a SQLAlchemy ``Result`` with ``scalar_one_or_none``.

    The wrapper simply remembers the row it was built with and hands that
    same object back on every call.
    """

    def __init__(self, row: _FakeRow) -> None:
        # Keep a reference (not a copy) so attribute writes made by the
        # code under test stay visible through the original object.
        self._row = row

    def scalar_one_or_none(self) -> _FakeRow:
        """Return the row supplied at construction time."""
        stored = self._row
        return stored
@dataclass
class MockExecutor:
    """In-memory executor that replaces Redis-based tool round-trip.

    Parameters
    ----------
    fixture_dir : Path
        Directory containing sample files for filesystem tool calls. Every
        path received from a tool call is interpreted relative to it.
    seed_records : dict[str, list[dict]]
        Pre-existing records per table, e.g. ``{"tasks": [...], "projects": [...]}``.
        The executor returns these for ``select`` / ``get`` actions and auto-updates
        them on ``insert`` / ``update`` / ``delete`` so subsequent selects reflect changes.
        The mapping is used in place (it is not copied).
    mutations : list[Mutation]
        Write operations recorded so far, in call order.
    """

    fixture_dir: Path
    seed_records: dict[str, list[dict]] = field(default_factory=dict)
    mutations: list[Mutation] = field(default_factory=list)
    # Last id handed out by ``_insert``; incremented before each use.
    _id_counter: int = field(default=1000, repr=False)

    # ── Public API ───────────────────────────────────────────────────

    def reset(self) -> None:
        """Clear recorded mutations (keep seed_records intact)."""
        self.mutations.clear()

    def get_mutations(self, *, table: str | None = None, action: str | None = None) -> list[Mutation]:
        """Filter mutations by table and/or action.

        A falsy ``table`` / ``action`` means "do not filter on this field".
        The returned list is always a new list, so a later ``reset()`` does
        not empty a result the caller is still holding.
        """
        selected = list(self.mutations)
        if table:
            selected = [m for m in selected if m.table == table]
        if action:
            selected = [m for m in selected if m.action == action]
        return selected

    def created_records(self, table: str) -> list[dict]:
        """Return data dicts of all inserts into *table*."""
        return [m.data for m in self.mutations if m.table == table and m.action == "insert"]

    def updated_records(self, table: str) -> list[dict]:
        """Return data dicts of all updates to *table*."""
        return [m.data for m in self.mutations if m.table == table and m.action == "update"]

    # ── Context manager for patching ──────────────────────────────

    @contextmanager
    def patch(self):
        """Patch execute_on_client and DB session at all usage sites.

        Yields the ``AsyncMock`` that stands in for ``execute_on_client``;
        every call to it is routed to :meth:`_handle`. All patches that were
        applied are undone on exit, including when applying a later patch
        fails part-way through.
        """
        # NOTE: inside this method body ``patch`` is the module-level
        # ``unittest.mock.patch``; class attributes are not in scope here.
        mock_fn = AsyncMock(side_effect=self._handle)
        targets = [
            "shared.ws_context.execute_on_client",
            "app.agent_runner.execute_on_client",
            "app.agents.filesystem_agent.execute_on_client",
        ]

        # Mock async_session so run_local_agent / _finalize_run skip real DB
        fake_row = _FakeRow()
        fake_db = AsyncMock()
        fake_db.commit = AsyncMock()
        fake_db.refresh = AsyncMock()
        fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row))
        # Replaced with a plain callable so ``add`` is synchronous rather
        # than an awaitable AsyncMock attribute.
        fake_db.add = lambda obj: None  # noqa: ARG005

        @asynccontextmanager
        async def _fake_session():
            yield fake_db

        patches = [patch(t, new=mock_fn) for t in targets]
        patches.append(patch("app.agent_runner.async_session", _fake_session))

        # Track what actually started: if one target cannot be resolved the
        # patches applied before it must still be rolled back, otherwise they
        # would stay active for the rest of the process.
        started = []
        try:
            for p in patches:
                p.start()
                started.append(p)
            yield mock_fn
        finally:
            for p in reversed(started):
                p.stop()

    # ── Internal dispatch ─────────────────────────────────────────

    async def _handle(
        self,
        action: str,
        table: str | None = None,
        data: dict[str, Any] | None = None,
        filters: dict[str, Any] | None = None,
        vector: list[float] | None = None,
        limit: int | None = None,
    ) -> dict[str, Any]:
        """Route one tool call to the matching in-memory handler.

        ``vector`` and ``limit`` are accepted so the signature matches the
        call sites, but no handler consumes them. Unknown actions are
        reported as ``{"error": ...}`` instead of raising.
        """
        # Filesystem
        if action == "list_directory":
            return self._list_directory(data or {})
        if action == "read_file_content":
            return self._read_file(data or {})
        if action == "get_file_metadata":
            return self._get_file_metadata(data or {})
        # CRUD
        if action == "select":
            return self._select(table or "", filters)
        if action == "get":
            return self._get(table or "", data or {})
        if action == "insert":
            return self._insert(table or "", data or {})
        if action == "update":
            return self._update(table or "", data or {})
        if action == "delete":
            return self._delete(table or "", data or {})
        # Vector (no-op for eval)
        if action in ("vector_upsert", "vector_search"):
            return {"rows": []}
        return {"error": f"Unknown action: {action}"}

    # ── Filesystem handlers ───────────────────────────────────────

    def _resolve(self, rel_path: str) -> Path | None:
        """Map a tool-call path onto ``fixture_dir``.

        Leading slashes/backslashes are stripped so the path is always
        taken relative to the fixture directory. Returns ``None`` when the
        normalised result would land outside ``fixture_dir`` (``..``
        segments, drive-qualified paths). The check is lexical
        (``os.path.abspath``), so symlinks are not followed.
        """
        root = Path(os.path.abspath(self.fixture_dir))
        candidate = Path(os.path.abspath(root / rel_path.lstrip("/\\")))
        if candidate != root and root not in candidate.parents:
            return None
        return candidate

    def _list_directory(self, data: dict) -> dict:
        """List the direct children of ``data["path"]``, sorted by path.

        Returns ``{"entries": []}`` when the path is not a directory inside
        the fixture directory.
        """
        rel_path = data.get("path") or ""
        abs_path = self._resolve(rel_path)
        if abs_path is None or not abs_path.is_dir():
            return {"entries": []}
        # Return paths relative to fixture_dir but with the original prefix
        prefix = rel_path.rstrip("/\\")
        entries: list[dict] = [
            {
                "name": child.name,
                "path": prefix + "/" + child.name,
                "type": "directory" if child.is_dir() else "file",
            }
            for child in sorted(abs_path.iterdir())
        ]
        return {"entries": entries}

    def _read_file(self, data: dict) -> dict:
        """Return the text of ``data["path"]`` decoded as UTF-8.

        Undecodable bytes are replaced rather than raising. A missing file
        yields empty content plus an ``error`` entry.
        """
        rel_path = data.get("path") or ""
        abs_path = self._resolve(rel_path)
        if abs_path is None or not abs_path.is_file():
            return {"content": "", "error": f"File not found: {rel_path}"}
        return {"content": abs_path.read_text(encoding="utf-8", errors="replace")}

    def _get_file_metadata(self, data: dict) -> dict:
        """Return size, timestamps (milliseconds) and kind of ``data["path"]``."""
        rel_path = data.get("path") or ""
        abs_path = self._resolve(rel_path)
        if abs_path is None or not abs_path.exists():
            return {"error": f"Not found: {rel_path}"}
        stat = abs_path.stat()
        return {
            "path": rel_path,
            "size": stat.st_size,
            "modifiedAt": int(stat.st_mtime * 1000),
            "createdAt": int(stat.st_ctime * 1000),
            "isDirectory": abs_path.is_dir(),
        }

    # ── CRUD handlers ─────────────────────────────────────────────

    def _select(self, table: str, filters: dict | None) -> dict:
        """Return the rows of *table* whose fields equal every filter value.

        Filters whose value is ``None`` are ignored.
        """
        rows = list(self.seed_records.get(table, []))
        if filters:
            rows = [
                r for r in rows
                if all(r.get(k) == v for k, v in filters.items() if v is not None)
            ]
        return {"rows": rows}

    def _get(self, table: str, data: dict) -> dict:
        """Return the first row of *table* whose ``id`` equals ``data["id"]``."""
        record_id = data.get("id", "")
        for r in self.seed_records.get(table, []):
            if r.get("id") == record_id:
                return {"row": r}
        return {"row": None}

    def _insert(self, table: str, data: dict) -> dict:
        """Store *data* under a freshly generated string id and record it."""
        self._id_counter += 1
        record = {**data, "id": str(self._id_counter)}
        # Add to seed so subsequent selects can find it
        self.seed_records.setdefault(table, []).append(record)
        # Record a snapshot, not the stored dict itself: the stored row is
        # modified in place by later updates, which must not rewrite what
        # the insert originally contained.
        self.mutations.append(Mutation(action="insert", table=table, data=dict(record)))
        return {"row": record}

    def _update(self, table: str, data: dict) -> dict:
        """Merge *data* into the row with the same ``id`` and record it.

        ``None`` and empty-string values in *data* are skipped, so they
        never overwrite an existing field.
        """
        record_id = data.get("id", "")
        for r in self.seed_records.get(table, []):
            if r.get("id") == record_id:
                r.update({k: v for k, v in data.items() if v is not None and v != ""})
                self.mutations.append(Mutation(action="update", table=table, data=dict(r)))
                return {"row": r}
        # Record not found — still log the mutation
        self.mutations.append(Mutation(action="update", table=table, data=dict(data)))
        return {"row": data}

    def _delete(self, table: str, data: dict) -> dict:
        """Drop every row of *table* whose ``id`` equals ``data["id"]``.

        The mutation is recorded and ``{"deleted": True}`` returned whether
        or not a row matched.
        """
        record_id = data.get("id", "")
        rows = self.seed_records.get(table, [])
        self.seed_records[table] = [r for r in rows if r.get("id") != record_id]
        self.mutations.append(Mutation(action="delete", table=table, data={"id": record_id}))
        return {"deleted": True}