Files
api/services/batch-agent/eval/config.py

221 lines
8.1 KiB
Python

"""Eval configuration — YAML fixture loader and dataclasses.
Fixtures come in two families:
1. **Agent fixtures** — test the batch agent pipeline.
Three modes controlled by ``mode``:
``step1`` — classification prompt only.
``step2`` — processing prompt only.
``full`` — both steps in sequence.
2. **Journey fixtures** — test the prompt-template builder conversation
(unchanged).
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal
import yaml
logger = logging.getLogger(__name__)
EvalMode = Literal["step1", "step2", "full"]
@dataclass
class ExpectedRecord:
"""A single expected extraction result.
Only the fields specified are checked — unspecified fields are ignored.
"""
table: str # tasks | notes | timelines | projects
fields: dict[str, Any] # field_name → expected_value
@dataclass
class ExpectedClassification:
"""Expected output of step-1 classification for one file."""
file: str # relative path to the sample file
project_id: str # expected matched project id, or "new"
domains: list[str] # expected domain list
new_project_name: str | None = None
@dataclass
class EvalFixture:
"""A complete test scenario loaded from YAML.
``mode`` determines which pipeline steps are exercised:
- **step1**: only ``_classify_file``
- **step2**: only the processing LLM + tool loop
- **full**: both steps in sequence (``run_local_agent``)
"""
name: str
description: str
mode: EvalMode
directory: str # relative path to sample files
data_types: list[str]
file_extensions: list[str]
models: list[str] # if empty, use CLI default
fixture_path: Path = field(default_factory=lambda: Path("."))
# ── Step-1 inputs (classification) ───────────────────────────
domain_definitions: str = ""
projects_list: list[dict[str, Any]] = field(default_factory=list)
custom_step1_prompt: str = ""
# ── Step-2 inputs (processing) ───────────────────────────────
existing_context: str = ""
project_context: str = ""
custom_prompt_section: str = ""
# ── Seed records for mock executor ───────────────────────────
seed_records: dict[str, list[dict]] = field(default_factory=dict)
# ── Expected outputs ─────────────────────────────────────────
expected_classification: list[ExpectedClassification] = field(default_factory=list)
expected: list[ExpectedRecord] = field(default_factory=list)
@property
def fixture_dir(self) -> Path:
"""Absolute path to the sample files directory."""
return self.fixture_path.parent / self.directory
@classmethod
def from_yaml(cls, path: Path) -> "EvalFixture":
"""Load a fixture from a YAML file."""
raw = yaml.safe_load(path.read_text(encoding="utf-8"))
mode: EvalMode = raw.get("mode", "full")
# Parse expected records (step2/full)
expected: list[ExpectedRecord] = []
for table, records in (raw.get("expected") or {}).items():
for rec in records:
expected.append(ExpectedRecord(table=table, fields=rec))
# Parse expected classification (step1/full)
expected_classification: list[ExpectedClassification] = []
for item in raw.get("expected_classification") or []:
expected_classification.append(ExpectedClassification(
file=item["file"],
project_id=item["project_id"],
domains=item.get("domains", []),
new_project_name=item.get("new_project_name"),
))
return cls(
name=raw["name"],
description=raw.get("description", ""),
mode=mode,
directory=raw.get("directory", "sample_files"),
data_types=raw.get("data_types", ["tasks"]),
file_extensions=raw.get("file_extensions", []),
models=raw.get("models", []),
fixture_path=path,
# Step-1 inputs
domain_definitions=raw.get("domain_definitions", ""),
projects_list=raw.get("projects_list", []),
# Step-2 inputs
existing_context=raw.get("existing_context", ""),
project_context=raw.get("project_context", ""),
custom_prompt_section=raw.get("custom_prompt_section", ""),
# Shared
seed_records=raw.get("seed_records", {}),
expected_classification=expected_classification,
expected=expected,
)
def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:
"""Find and load all YAML fixtures in the fixtures directory."""
if fixtures_dir is None:
fixtures_dir = Path(__file__).parent / "fixtures"
fixtures: list[EvalFixture] = []
if not fixtures_dir.is_dir():
logger.warning("eval: fixtures directory not found: %s", fixtures_dir)
return fixtures
for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
try:
raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
if raw.get("type") == "journey":
continue # Skip journey fixtures
fixtures.append(EvalFixture.from_yaml(yaml_path))
logger.info("eval: loaded fixture %s from %s", fixtures[-1].name, yaml_path.name)
except Exception as exc:
logger.error("eval: failed to load fixture %s: %s", yaml_path.name, exc)
return fixtures
# ── Journey fixtures ─────────────────────────────────────────────────────
@dataclass
class JourneyFixture:
"""A journey test scenario — tests the prompt_template builder conversation."""
name: str
description: str
directory: str # relative path to sample files
data_types: list[str]
expected_template_criteria: list[str] # what the template should contain/satisfy
user_messages: list[str] = field(default_factory=list) # for automated journey runs (unused in interactive mode)
models: list[str] = field(default_factory=list)
fixture_path: Path = field(default_factory=lambda: Path("."))
@property
def fixture_dir(self) -> Path:
"""Absolute path to the sample files directory."""
return self.fixture_path.parent / self.directory
@classmethod
def from_yaml(cls, path: Path) -> "JourneyFixture":
"""Load a journey fixture from a YAML file."""
raw = yaml.safe_load(path.read_text(encoding="utf-8"))
return cls(
name=raw["name"],
description=raw.get("description", ""),
directory=raw.get("directory", "sample_files"),
data_types=raw.get("data_types", ["tasks"]),
user_messages=raw.get("user_messages", []),
expected_template_criteria=raw.get("expected_template_criteria", []),
models=raw.get("models", []),
fixture_path=path,
)
def discover_journey_fixtures(fixtures_dir: Path | None = None) -> list[JourneyFixture]:
"""Find and load all journey YAML fixtures in the fixtures directory."""
if fixtures_dir is None:
fixtures_dir = Path(__file__).parent / "fixtures"
fixtures: list[JourneyFixture] = []
if not fixtures_dir.is_dir():
logger.warning("eval: fixtures directory not found: %s", fixtures_dir)
return fixtures
for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
try:
raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
if raw.get("type") != "journey":
continue
fixtures.append(JourneyFixture.from_yaml(yaml_path))
logger.info("eval: loaded journey fixture %s from %s", fixtures[-1].name, yaml_path.name)
except Exception as exc:
logger.error("eval: failed to load journey fixture %s: %s", yaml_path.name, exc)
return fixtures