feat(batch-agent): add E2E evaluation harness with Langfuse integration
- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete)
- eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides
- eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval)
- eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs
- eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting
- eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags
- eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
This commit is contained in:
208
services/batch-agent/eval/mock_executor.py
Normal file
208
services/batch-agent/eval/mock_executor.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""Mock executor — intercepts execute_on_client for offline E2E testing.
|
||||
|
||||
Patches ``app.ws_context.execute_on_client`` so agent pipeline runs don't
|
||||
require a live Electron client or Redis. Instead:
|
||||
|
||||
- **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
|
||||
are served from local fixture files on disk.
|
||||
- **Read actions** (select, get) return preseeded records from an in-memory
|
||||
store provided by the test fixture.
|
||||
- **Write actions** (insert, update, delete) are captured as *mutations* and
|
||||
stored for later comparison against expected results.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
|
||||
@dataclass
class Mutation:
    """One write operation captured by the mock executor.

    Attributes
    ----------
    action : str
        Kind of write: ``insert``, ``update`` or ``delete``.
    table : str
        Name of the table the write targeted.
    data : dict[str, Any]
        Payload recorded for the write.
    timestamp : float
        ``time.time()`` value taken when the mutation object was created,
        unless an explicit value is supplied.
    """

    # Kind of write that was issued: insert | update | delete.
    action: str
    # Table the write was addressed to.
    table: str
    # Payload recorded for the write.
    data: dict[str, Any]
    # Creation time in seconds since the epoch; defaults to "now".
    timestamp: float = field(default_factory=time.time)
|
||||
|
||||
|
||||
@dataclass
class MockExecutor:
    """In-memory executor that replaces the Redis-based tool round-trip.

    Parameters
    ----------
    fixture_dir : Path
        Directory containing sample files for filesystem tool calls. Paths
        that would climb out of this directory (``..``) or that carry a drive
        or absolute root are reported as missing instead of being served.
    seed_records : dict[str, list[dict]]
        Pre-existing records per table, e.g. ``{"tasks": [...], "projects": [...]}``.
        The executor returns these for ``select`` / ``get`` actions and auto-updates
        them on ``insert`` / ``update`` / ``delete`` so subsequent selects reflect
        changes. The mapping passed in is used as the live store and is
        modified by write actions.
    mutations : list[Mutation]
        Chronological log of every write action handled so far. Each entry
        holds a snapshot of the record at the time of the write.
    """

    fixture_dir: Path
    seed_records: dict[str, list[dict]] = field(default_factory=dict)
    mutations: list[Mutation] = field(default_factory=list)
    # Last id handed out by ``_insert``; the first generated id is "1001".
    _id_counter: int = field(default=1000, repr=False)

    # ── Public API ───────────────────────────────────────────────────

    def reset(self) -> None:
        """Clear the mutation log.

        Only the log is cleared: the record store (including rows added or
        changed by earlier writes) and the id counter are left untouched.
        """
        self.mutations.clear()

    def get_mutations(self, *, table: str | None = None, action: str | None = None) -> list[Mutation]:
        """Return recorded mutations, optionally filtered by table and/or action.

        A new list is always returned, so the result stays valid after
        ``reset()`` and can be modified by the caller without touching the log.
        Falsy filter values (``None`` or ``""``) mean "do not filter".
        """
        return [
            m
            for m in self.mutations
            if (not table or m.table == table) and (not action or m.action == action)
        ]

    def created_records(self, table: str) -> list[dict]:
        """Return data dicts of all inserts into *table*."""
        return [m.data for m in self.mutations if m.table == table and m.action == "insert"]

    def updated_records(self, table: str) -> list[dict]:
        """Return data dicts of all updates to *table*."""
        return [m.data for m in self.mutations if m.table == table and m.action == "update"]

    # ── Context manager for patching ──────────────────────────────

    def patch(self):
        """Return a context manager that patches ``execute_on_client``.

        The returned object comes from ``unittest.mock.patch`` and is a regular
        (synchronous) context manager: use ``with executor.patch():``, not
        ``async with``. While active, ``app.ws_context.execute_on_client`` is an
        ``AsyncMock`` whose awaited result is produced by ``_handle``.
        """
        # NOTE(review): only the attribute on app.ws_context is replaced; modules
        # that imported the function by name beforehand would keep the original.
        return patch(
            "app.ws_context.execute_on_client",
            new=AsyncMock(side_effect=self._handle),
        )

    # ── Internal dispatch ─────────────────────────────────────────

    async def _handle(
        self,
        action: str,
        table: str | None = None,
        data: dict[str, Any] | None = None,
        filters: dict[str, Any] | None = None,
        vector: list[float] | None = None,
        limit: int | None = None,
    ) -> dict[str, Any]:
        """Dispatch one tool call to the matching in-memory handler.

        Missing ``table`` / ``data`` are normalised to ``""`` / ``{}``.
        Unknown actions yield ``{"error": "Unknown action: ..."}`` rather than
        raising.
        """
        # NOTE(review): ``vector`` and ``limit`` are accepted for signature
        # compatibility but are not applied by any handler below.
        payload = data or {}
        table_name = table or ""

        # Filesystem
        if action == "list_directory":
            return self._list_directory(payload)
        if action == "read_file_content":
            return self._read_file(payload)
        if action == "get_file_metadata":
            return self._get_file_metadata(payload)

        # CRUD
        if action == "select":
            return self._select(table_name, filters)
        if action == "get":
            return self._get(table_name, payload)
        if action == "insert":
            return self._insert(table_name, payload)
        if action == "update":
            return self._update(table_name, payload)
        if action == "delete":
            return self._delete(table_name, payload)

        # Vector (no-op for eval)
        if action in ("vector_upsert", "vector_search"):
            return {"rows": []}

        return {"error": f"Unknown action: {action}"}

    # ── Filesystem handlers ───────────────────────────────────────

    def _resolve(self, rel_path: str) -> Path | None:
        """Map a tool-supplied path onto ``fixture_dir``.

        Leading slashes/backslashes are stripped so "absolute" tool paths are
        treated as relative to the fixture root. Returns ``None`` when the
        lexically normalised path would leave the fixture directory.
        """
        normalized = Path(os.path.normpath(rel_path.lstrip("/\\")))
        escapes = normalized.parts[:1] == ("..",)
        if escapes or normalized.is_absolute() or normalized.drive:
            return None
        return Path(self.fixture_dir) / normalized

    def _list_directory(self, data: dict) -> dict:
        """List the fixture directory at ``data["path"]``, sorted by path.

        A missing, non-directory or out-of-fixture path yields no entries.
        """
        rel_path = data.get("path") or ""
        abs_path = self._resolve(rel_path)
        if abs_path is None or not abs_path.is_dir():
            return {"entries": []}
        # Entry paths keep the caller's original prefix rather than the
        # on-disk fixture location.
        prefix = rel_path.rstrip("/\\")
        entries: list[dict] = [
            {
                "name": child.name,
                "path": prefix + "/" + child.name,
                "type": "directory" if child.is_dir() else "file",
            }
            for child in sorted(abs_path.iterdir())
        ]
        return {"entries": entries}

    def _read_file(self, data: dict) -> dict:
        """Return the UTF-8 text of a fixture file (undecodable bytes replaced)."""
        rel_path = data.get("path") or ""
        abs_path = self._resolve(rel_path)
        if abs_path is None or not abs_path.is_file():
            return {"content": "", "error": f"File not found: {rel_path}"}
        return {"content": abs_path.read_text(encoding="utf-8", errors="replace")}

    def _get_file_metadata(self, data: dict) -> dict:
        """Return size, timestamps (milliseconds) and type of a fixture path."""
        rel_path = data.get("path") or ""
        abs_path = self._resolve(rel_path)
        if abs_path is None or not abs_path.exists():
            return {"error": f"Not found: {rel_path}"}
        stat = abs_path.stat()
        return {
            "path": rel_path,
            "size": stat.st_size,
            "modifiedAt": int(stat.st_mtime * 1000),
            # st_ctime is the metadata-change time on Unix and the creation
            # time on Windows.
            "createdAt": int(stat.st_ctime * 1000),
            "isDirectory": abs_path.is_dir(),
        }

    # ── CRUD handlers ─────────────────────────────────────────────

    def _select(self, table: str, filters: dict | None) -> dict:
        """Return copies of the rows of *table* matching every non-``None`` filter."""
        rows = self.seed_records.get(table, [])
        if filters:
            wanted = {k: v for k, v in filters.items() if v is not None}
            rows = [r for r in rows if all(r.get(k) == v for k, v in wanted.items())]
        # Copies keep callers from editing the store through the result.
        return {"rows": [dict(r) for r in rows]}

    def _get(self, table: str, data: dict) -> dict:
        """Return a copy of the first row whose ``id`` equals ``data["id"]``, else ``None``."""
        record_id = data.get("id", "")
        for row in self.seed_records.get(table, []):
            if row.get("id") == record_id:
                return {"row": dict(row)}
        return {"row": None}

    def _insert(self, table: str, data: dict) -> dict:
        """Store *data* under a freshly generated string id and log the insert.

        Any ``id`` supplied in *data* is replaced by the generated one.
        """
        self._id_counter += 1
        record = {**data, "id": str(self._id_counter)}
        # Add to the store so subsequent selects can find it.
        self.seed_records.setdefault(table, []).append(record)
        # Log a snapshot: the stored dict is modified in place by later
        # updates, which must not rewrite what the insert looked like.
        self.mutations.append(Mutation(action="insert", table=table, data=dict(record)))
        return {"row": dict(record)}

    def _update(self, table: str, data: dict) -> dict:
        """Merge the non-empty fields of *data* into the row with the same ``id``.

        ``None`` and ``""`` values are ignored so they never blank out an
        existing field. When no row matches, the update is still logged with
        the raw payload and that payload is echoed back.
        """
        record_id = data.get("id", "")
        changes = {k: v for k, v in data.items() if v is not None and v != ""}
        for row in self.seed_records.get(table, []):
            if row.get("id") == record_id:
                row.update(changes)
                self.mutations.append(Mutation(action="update", table=table, data=dict(row)))
                return {"row": dict(row)}
        # Record not found — still log the mutation
        self.mutations.append(Mutation(action="update", table=table, data=dict(data)))
        return {"row": data}

    def _delete(self, table: str, data: dict) -> dict:
        """Drop every row of *table* whose ``id`` equals ``data["id"]`` and log it.

        Always reports ``{"deleted": True}``, whether or not a row matched.
        """
        record_id = data.get("id", "")
        remaining = [r for r in self.seed_records.get(table, []) if r.get("id") != record_id]
        self.seed_records[table] = remaining
        self.mutations.append(Mutation(action="delete", table=table, data={"id": record_id}))
        return {"deleted": True}
|
||||
Reference in New Issue
Block a user