- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
259 lines
9.7 KiB
Python
259 lines
9.7 KiB
Python
"""Mock executor — intercepts execute_on_client for offline E2E testing.

Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't
require a live Electron client or Redis. Instead:

- **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
  are served from local fixture files on disk.
- **Read actions** (select, get) return pre-seeded records from an in-memory
  store provided by the test fixture.
- **Write actions** (insert, update, delete) are captured as *mutations* and
  stored for later comparison against expected results.
"""
|
|
|
|
from __future__ import annotations

import json
import os
import time
import uuid
from contextlib import ExitStack, asynccontextmanager, contextmanager
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from unittest.mock import AsyncMock, patch
|
|
|
|
|
|
@dataclass
class Mutation:
    """A single recorded write operation.

    Attributes
    ----------
    action : str
        Kind of write: ``"insert"``, ``"update"`` or ``"delete"``.
    table : str
        Name of the table the write targeted.
    data : dict[str, Any]
        Payload recorded for the write.
    timestamp : float
        ``time.time()`` at the moment the mutation object was created,
        unless an explicit value is passed.
    """

    action: str  # insert | update | delete
    table: str
    data: dict[str, Any]
    # default_factory so each instance is stamped at creation time,
    # not once at class-definition time.
    timestamp: float = field(default_factory=time.time)
|
|
|
|
|
|
# ── Fake DB helpers (used to bypass async_session in full mode) ───────
|
|
|
|
class _FakeRow:
    """Mimics an AgentRunLog row returned by SQLAlchemy.

    Scalar defaults live on the class; any attribute can be reassigned on an
    instance. ``errors`` is created per instance so that one fake row's
    appended errors never show up on another.
    """

    id = 0
    status = "running"
    items_processed = 0
    items_created = 0
    errors: list[str]
    completed_at = None

    def __init__(self) -> None:
        # A class-level ``[]`` would be one list shared by every _FakeRow,
        # leaking appended errors between otherwise independent runs.
        self.errors = []
|
|
|
|
|
|
class _FakeResult:
    """Mimics a SQLAlchemy ``Result`` with ``scalar_one_or_none``.

    Wraps one pre-built row and hands the same object back on every call.
    """

    def __init__(self, row: _FakeRow) -> None:
        self._row = row

    def scalar_one_or_none(self) -> _FakeRow:
        """Return the wrapped row (this fake never returns ``None`` itself)."""
        return self._row
|
|
|
|
|
|
@dataclass
class MockExecutor:
    """In-memory executor that replaces Redis-based tool round-trip.

    Parameters
    ----------
    fixture_dir : Path
        Directory containing sample files for filesystem tool calls.
        A plain string is accepted and converted to ``Path``.
    seed_records : dict[str, list[dict]]
        Pre-existing records per table, e.g. ``{"tasks": [...], "projects": [...]}``.
        The executor returns these for ``select`` / ``get`` actions and auto-updates
        them on ``insert`` / ``update`` / ``delete`` so subsequent selects reflect changes.
    mutations : list[Mutation]
        Every write seen so far, in call order.
    """

    fixture_dir: Path
    seed_records: dict[str, list[dict]] = field(default_factory=dict)
    mutations: list[Mutation] = field(default_factory=list)
    _id_counter: int = field(default=1000, repr=False)

    def __post_init__(self) -> None:
        # The filesystem handlers join with ``/``, which needs a Path.
        self.fixture_dir = Path(self.fixture_dir)

    # ── Public API ───────────────────────────────────────────────────

    def reset(self) -> None:
        """Clear recorded mutations.

        ``seed_records`` is left as it is, including any rows that earlier
        writes added, changed or removed.
        """
        self.mutations.clear()

    def get_mutations(self, *, table: str | None = None, action: str | None = None) -> list[Mutation]:
        """Filter mutations by table and/or action.

        A falsy ``table`` / ``action`` means "do not filter on this field".
        The result is always a new list, so a later ``reset()`` does not
        empty a list the caller is still holding.
        """
        return [
            m
            for m in self.mutations
            if (not table or m.table == table) and (not action or m.action == action)
        ]

    def created_records(self, table: str) -> list[dict]:
        """Return data dicts of all inserts into *table*."""
        return [m.data for m in self.mutations if m.table == table and m.action == "insert"]

    def updated_records(self, table: str) -> list[dict]:
        """Return data dicts of all updates to *table*."""
        return [m.data for m in self.mutations if m.table == table and m.action == "update"]

    # ── Context manager for patching ──────────────────────────────

    @contextmanager
    def patch(self):
        """Patch execute_on_client and DB session at all usage sites.

        Yields the ``AsyncMock`` installed in place of ``execute_on_client``;
        its side effect is :meth:`_handle`. Every patch is undone on exit,
        including when a later target fails to patch part-way through setup.
        """
        mock_fn = AsyncMock(side_effect=self._handle)
        targets = [
            "shared.ws_context.execute_on_client",
            "app.agent_runner.execute_on_client",
            "app.agents.filesystem_agent.execute_on_client",
        ]

        # Mock async_session so run_local_agent / _finalize_run skip real DB
        fake_row = _FakeRow()
        fake_db = AsyncMock()
        fake_db.commit = AsyncMock()
        fake_db.refresh = AsyncMock()
        fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row))
        # Plain function: ``add`` is replaced so it is not an awaitable mock.
        fake_db.add = lambda obj: None  # noqa: ARG005

        @asynccontextmanager
        async def _fake_session():
            yield fake_db

        # Inside a method body the bare name ``patch`` resolves to the
        # module-level ``unittest.mock.patch``, not to this method.
        with ExitStack() as stack:
            for target in targets:
                stack.enter_context(patch(target, new=mock_fn))
            stack.enter_context(patch("app.agent_runner.async_session", _fake_session))
            yield mock_fn

    # ── Internal dispatch ─────────────────────────────────────────

    async def _handle(
        self,
        action: str,
        table: str | None = None,
        data: dict[str, Any] | None = None,
        filters: dict[str, Any] | None = None,
        vector: list[float] | None = None,
        limit: int | None = None,
    ) -> dict[str, Any]:
        """Route one tool call to the matching in-memory handler.

        ``vector`` and ``limit`` are accepted but not used by any handler.
        NOTE(review): the parameter list presumably mirrors the real
        ``execute_on_client`` — confirm against its definition.

        Returns the handler's result dict, ``{"rows": []}`` for the vector
        actions, or ``{"error": ...}`` for an unrecognised action.
        """
        payload = data or {}
        table_name = table or ""

        # Filesystem
        if action == "list_directory":
            return self._list_directory(payload)
        if action == "read_file_content":
            return self._read_file(payload)
        if action == "get_file_metadata":
            return self._get_file_metadata(payload)

        # CRUD
        if action == "select":
            return self._select(table_name, filters)
        if action == "get":
            return self._get(table_name, payload)
        if action == "insert":
            return self._insert(table_name, payload)
        if action == "update":
            return self._update(table_name, payload)
        if action == "delete":
            return self._delete(table_name, payload)

        # Vector (no-op for eval)
        if action in ("vector_upsert", "vector_search"):
            return {"rows": []}

        return {"error": f"Unknown action: {action}"}

    # ── Filesystem handlers ───────────────────────────────────────

    def _resolve(self, rel_path: str) -> Path:
        """Map a tool-supplied path onto ``fixture_dir``.

        Leading slashes/backslashes are stripped so an "absolute" tool path
        is joined under the fixture directory instead of replacing it.
        """
        return self.fixture_dir / rel_path.lstrip("/\\")

    def _list_directory(self, data: dict) -> dict:
        """List the children of ``data["path"]``, sorted by path.

        Returns ``{"entries": []}`` when the path is not a directory.
        """
        rel_path = data.get("path") or ""
        abs_path = self._resolve(rel_path)
        if not abs_path.is_dir():
            return {"entries": []}
        # Return paths relative to fixture_dir but with the original prefix
        prefix = rel_path.rstrip("/\\")
        entries: list[dict] = [
            {
                "name": child.name,
                "path": prefix + "/" + child.name,
                "type": "directory" if child.is_dir() else "file",
            }
            for child in sorted(abs_path.iterdir())
        ]
        return {"entries": entries}

    def _read_file(self, data: dict) -> dict:
        """Return the UTF-8 text of ``data["path"]`` (bad bytes replaced).

        A missing file yields empty content plus an ``error`` message.
        """
        rel_path = data.get("path") or ""
        abs_path = self._resolve(rel_path)
        if not abs_path.is_file():
            return {"content": "", "error": f"File not found: {rel_path}"}
        return {"content": abs_path.read_text(encoding="utf-8", errors="replace")}

    def _get_file_metadata(self, data: dict) -> dict:
        """Return size, timestamps (milliseconds) and type for ``data["path"]``."""
        rel_path = data.get("path") or ""
        abs_path = self._resolve(rel_path)
        if not abs_path.exists():
            return {"error": f"Not found: {rel_path}"}
        stat = abs_path.stat()
        return {
            "path": rel_path,
            "size": stat.st_size,
            "modifiedAt": int(stat.st_mtime * 1000),
            # st_ctime is creation time on Windows but last metadata change
            # on Unix, so "createdAt" is only approximate there.
            "createdAt": int(stat.st_ctime * 1000),
            "isDirectory": abs_path.is_dir(),
        }

    # ── CRUD handlers ─────────────────────────────────────────────

    def _select(self, table: str, filters: dict | None) -> dict:
        """Return the rows of *table* whose fields equal every given filter.

        Filters whose value is ``None`` are ignored.
        """
        rows = list(self.seed_records.get(table, []))
        if filters:
            rows = [
                r for r in rows
                if all(r.get(k) == v for k, v in filters.items() if v is not None)
            ]
        return {"rows": rows}

    def _get(self, table: str, data: dict) -> dict:
        """Return the first row of *table* whose ``id`` equals ``data["id"]``."""
        record_id = data.get("id", "")
        for r in self.seed_records.get(table, []):
            if r.get("id") == record_id:
                return {"row": r}
        return {"row": None}

    def _insert(self, table: str, data: dict) -> dict:
        """Store a new row under a freshly generated string id and record it.

        Any ``id`` supplied in *data* is overwritten by the generated one.
        """
        self._id_counter += 1
        record = {**data, "id": str(self._id_counter)}
        # Add to seed so subsequent selects can find it
        self.seed_records.setdefault(table, []).append(record)
        # Record a snapshot: the stored row is edited in place by later
        # updates, which must not rewrite what this insert looked like.
        self.mutations.append(Mutation(action="insert", table=table, data=dict(record)))
        return {"row": record}

    def _update(self, table: str, data: dict) -> dict:
        """Merge *data* into the row with the same ``id`` and record the result.

        ``None`` and empty-string values are skipped when merging. If no row
        matches, the update is still recorded with the payload as given.
        """
        record_id = data.get("id", "")
        for r in self.seed_records.get(table, []):
            if r.get("id") == record_id:
                r.update({k: v for k, v in data.items() if v is not None and v != ""})
                self.mutations.append(Mutation(action="update", table=table, data=dict(r)))
                return {"row": r}
        # Record not found — still log the mutation (as a snapshot, so the
        # caller's dict can change afterwards without altering the record)
        self.mutations.append(Mutation(action="update", table=table, data=dict(data)))
        return {"row": data}

    def _delete(self, table: str, data: dict) -> dict:
        """Drop every row of *table* whose ``id`` matches and record the delete.

        Reports ``{"deleted": True}`` whether or not a row was removed.
        """
        record_id = data.get("id", "")
        rows = self.seed_records.get(table, [])
        self.seed_records[table] = [r for r in rows if r.get("id") != record_id]
        self.mutations.append(Mutation(action="delete", table=table, data={"id": record_id}))
        return {"deleted": True}
|