feat(batch-agent): add E2E evaluation harness with Langfuse integration
- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete)
- eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides
- eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval)
- eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs
- eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting
- eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags
- eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
This commit is contained in:
129
services/batch-agent/eval/config.py
Normal file
129
services/batch-agent/eval/config.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""Eval configuration — YAML fixture loader and dataclasses.

A *fixture* is a YAML file that defines a complete test scenario:

.. code-block:: yaml

    name: freelance-invoices
    description: Extract tasks and notes from invoice PDFs (text layer)
    directory: sample_files/invoices   # relative to fixture dir
    data_types: [tasks, notes]
    file_extensions: [txt, md]

    # Preseeded records the agent "sees" as existing data
    seed_records:
      projects:
        - id: proj-1
          name: "Website Redesign"
          status: active
      tasks: []

    # Prompt variations to test (at least one required)
    prompt_variants:
      baseline: |
        Extract action items as tasks and meeting summaries as notes.
        Set priority based on urgency keywords.
      detailed: |
        Extract action items as tasks. Map "URGENT" to high priority,
        "ASAP" to medium. Summaries become notes with full content.

    # Expected extractions — what the agent SHOULD produce
    expected:
      tasks:
        - title: "Send revised invoice to client"
          priority: high
          status: todo
        - title: "Update project timeline"
          priority: medium
      notes:
        - title: "Meeting summary - March kickoff"

    # Optional: models to test (overrides CLI --models)
    models: []
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ExpectedRecord:
    """One record the agent is expected to extract.

    Matching is partial: a field that is not listed in ``fields`` is not
    checked at all.

    Attributes:
        table: Name of the table the record belongs to
            (tasks | notes | timelines | projects).
        fields: Mapping of field_name → expected_value.
    """

    table: str
    fields: dict[str, Any]
|
||||
|
||||
|
||||
@dataclass
class EvalFixture:
    """A complete test scenario loaded from YAML.

    Instances are normally built with :meth:`from_yaml`, which fills in
    defaults for every optional key of the fixture file.
    """

    name: str
    description: str
    directory: str  # relative path to sample files
    data_types: list[str]
    file_extensions: list[str]
    seed_records: dict[str, list[dict]]  # table name → preseeded records
    prompt_variants: dict[str, str]  # variant_name → prompt_template
    expected: list[ExpectedRecord]
    models: list[str]  # if empty, use CLI default
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property
    def fixture_dir(self) -> Path:
        """Path to the sample files directory.

        ``directory`` is joined onto the folder that contains the fixture
        file, so the result is only absolute when ``fixture_path`` (or
        ``directory`` itself) is absolute.
        """
        return self.fixture_path.parent / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "EvalFixture":
        """Load a fixture from a YAML file.

        Args:
            path: Location of the fixture file; stored as ``fixture_path``.

        Returns:
            The populated fixture.

        Raises:
            ValueError: If the file is empty or its top level is not a mapping.
            KeyError: If the required ``name`` key is missing.

        File-reading and YAML-parsing errors propagate unchanged.
        """
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        return cls._from_mapping(raw, path)

    @classmethod
    def _from_mapping(cls, raw: Any, path: Path) -> "EvalFixture":
        """Build a fixture from already-parsed YAML content."""
        # An empty file parses to None and a top-level list has no .get();
        # fail with a message that names the file instead of AttributeError.
        if not isinstance(raw, dict):
            raise ValueError(
                f"fixture {path} must contain a YAML mapping, "
                f"got {type(raw).__name__}"
            )

        def setting(key: str, default: Any) -> Any:
            # A key written without a value ("models:") parses to None; treat
            # it like a missing key. Explicit empty values are kept as given.
            value = raw.get(key)
            return default if value is None else value

        expected: list[ExpectedRecord] = []
        for table, records in (raw.get("expected") or {}).items():
            # A table listed with no entries ("notes:") parses to None.
            for rec in records or []:
                expected.append(ExpectedRecord(table=table, fields=rec))

        return cls(
            name=raw["name"],
            description=setting("description", ""),
            directory=setting("directory", "sample_files"),
            data_types=setting("data_types", ["tasks"]),
            file_extensions=setting("file_extensions", []),
            seed_records=setting("seed_records", {}),
            prompt_variants=setting("prompt_variants", {"default": ""}),
            expected=expected,
            models=setting("models", []),
            fixture_path=path,
        )
|
||||
|
||||
|
||||
def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:
    """Load every ``*.yaml`` fixture found directly inside a directory.

    Args:
        fixtures_dir: Directory to scan. When omitted, the ``fixtures``
            folder next to this module is used.

    Returns:
        The fixtures that loaded successfully, in sorted path order. The
        list is empty when the directory does not exist. A file that fails
        to load is logged and skipped rather than aborting the scan.
    """
    root = Path(__file__).parent / "fixtures" if fixtures_dir is None else fixtures_dir

    if not root.is_dir():
        logger.warning("eval: fixtures directory not found: %s", root)
        return []

    loaded: list[EvalFixture] = []
    for candidate in sorted(root.glob("*.yaml")):
        try:
            fixture = EvalFixture.from_yaml(candidate)
            loaded.append(fixture)
            logger.info("eval: loaded fixture %s from %s", fixture.name, candidate.name)
        except Exception as exc:
            # Best effort: one broken fixture must not hide the others.
            logger.error("eval: failed to load fixture %s: %s", candidate.name, exc)

    return loaded
|
||||
Reference in New Issue
Block a user