221 lines
8.1 KiB
Python
221 lines
8.1 KiB
Python
"""Eval configuration — YAML fixture loader and dataclasses.

Fixtures come in two families:

1. **Agent fixtures** — test the batch agent pipeline.
   Three modes controlled by ``mode``:

   ``step1`` — classification prompt only.
   ``step2`` — processing prompt only.
   ``full``  — both steps in sequence.

2. **Journey fixtures** — test the prompt-template builder conversation
   (unchanged).
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any, Literal
|
|
|
|
import yaml
|
|
|
|
# Logger named after this module; used by the fixture discovery functions.
logger = logging.getLogger(__name__)

# The values a fixture's ``mode`` key may take (see the module docstring).
EvalMode = Literal["step1", "step2", "full"]
|
|
|
|
|
|
@dataclass
class ExpectedRecord:
    """One record the extraction is expected to produce.

    Matching is partial: only the keys listed in ``fields`` are checked,
    and any field that is not listed is ignored.
    """

    # Target table: tasks | notes | timelines | projects
    table: str
    # field_name → expected_value
    fields: dict[str, Any]
|
|
|
|
|
|
@dataclass
class ExpectedClassification:
    """What step-1 classification is expected to return for a single file."""

    # Relative path to the sample file
    file: str
    # Expected matched project id, or "new"
    project_id: str
    # Expected domain list
    domains: list[str]
    # Optional expected project name; defaults to None
    new_project_name: str | None = None
|
|
|
|
|
|
@dataclass
class EvalFixture:
    """A complete test scenario loaded from YAML.

    ``mode`` determines which pipeline steps are exercised:

    - **step1**: only ``_classify_file``
    - **step2**: only the processing LLM + tool loop
    - **full**: both steps in sequence (``run_local_agent``)

    Instances are normally built with :meth:`from_yaml`, which supplies a
    default for every key except ``name``.
    """

    name: str
    description: str
    mode: EvalMode
    directory: str  # relative path to sample files
    data_types: list[str]
    file_extensions: list[str]
    models: list[str]  # if empty, use CLI default
    fixture_path: Path = field(default_factory=lambda: Path("."))

    # ── Step-1 inputs (classification) ───────────────────────────
    domain_definitions: str = ""
    projects_list: list[dict[str, Any]] = field(default_factory=list)
    custom_step1_prompt: str = ""

    # ── Step-2 inputs (processing) ───────────────────────────────
    existing_context: str = ""
    project_context: str = ""
    custom_prompt_section: str = ""

    # ── Seed records for mock executor ───────────────────────────
    seed_records: dict[str, list[dict]] = field(default_factory=dict)

    # ── Expected outputs ─────────────────────────────────────────
    expected_classification: list[ExpectedClassification] = field(default_factory=list)
    expected: list[ExpectedRecord] = field(default_factory=list)

    @property
    def fixture_dir(self) -> Path:
        """Path to the sample files directory.

        This is ``directory`` joined onto the folder that contains
        ``fixture_path``; it is absolute only when ``fixture_path`` is.
        """
        return self.fixture_path.parent / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "EvalFixture":
        """Load a fixture from a YAML file.

        Args:
            path: Location of the fixture file; stored as ``fixture_path``.

        Returns:
            The populated fixture.

        Raises:
            ValueError: If the document's top level is not a mapping
                (for example, an empty file).
            KeyError: If ``name`` is missing, or an
                ``expected_classification`` entry lacks ``file`` or
                ``project_id``.
        """
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        return cls._from_raw(raw, path)

    @classmethod
    def _from_raw(cls, raw: Any, path: Path) -> "EvalFixture":
        """Build a fixture from an already-parsed YAML document."""
        if not isinstance(raw, dict):
            raise ValueError(
                f"fixture {path} must contain a YAML mapping at the top level, "
                f"got {type(raw).__name__}"
            )

        def value(key: str, default: Any) -> Any:
            # A key written without a value parses as None; treat it like a
            # missing key so list/str fields never end up holding None.
            found = raw.get(key)
            return default if found is None else found

        # Expected records (step2/full): {table: [record, ...]}
        expected: list[ExpectedRecord] = []
        for table, records in (raw.get("expected") or {}).items():
            # A table listed with no records parses as None.
            expected.extend(
                ExpectedRecord(table=table, fields=rec) for rec in records or []
            )

        # Expected classification (step1/full): one entry per file
        expected_classification: list[ExpectedClassification] = []
        for item in raw.get("expected_classification") or []:
            domains = item.get("domains")
            expected_classification.append(
                ExpectedClassification(
                    file=item["file"],
                    project_id=item["project_id"],
                    domains=[] if domains is None else domains,
                    new_project_name=item.get("new_project_name"),
                )
            )

        return cls(
            name=raw["name"],
            description=value("description", ""),
            mode=value("mode", "full"),
            directory=value("directory", "sample_files"),
            data_types=value("data_types", ["tasks"]),
            file_extensions=value("file_extensions", []),
            models=value("models", []),
            fixture_path=path,
            # Step-1 inputs
            domain_definitions=value("domain_definitions", ""),
            projects_list=value("projects_list", []),
            # Previously never read from YAML, so the field was always "".
            custom_step1_prompt=value("custom_step1_prompt", ""),
            # Step-2 inputs
            existing_context=value("existing_context", ""),
            project_context=value("project_context", ""),
            custom_prompt_section=value("custom_prompt_section", ""),
            # Shared
            seed_records=value("seed_records", {}),
            expected_classification=expected_classification,
            expected=expected,
        )
|
|
|
|
|
|
def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:
    """Find and load all YAML fixtures in the fixtures directory.

    Args:
        fixtures_dir: Directory scanned for ``*.yaml`` files. Defaults to the
            ``fixtures`` directory next to this module.

    Returns:
        The loaded fixtures in sorted path order. Files whose ``type`` is
        ``journey`` are skipped; files that cannot be loaded are logged and
        skipped. A missing directory yields an empty list.
    """
    if fixtures_dir is None:
        fixtures_dir = Path(__file__).parent / "fixtures"

    fixtures: list[EvalFixture] = []
    if not fixtures_dir.is_dir():
        logger.warning("eval: fixtures directory not found: %s", fixtures_dir)
        return fixtures

    for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
        try:
            raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
            if not isinstance(raw, dict):
                # Empty files parse as None; say so explicitly rather than
                # relying on an AttributeError from ``raw.get`` below.
                logger.error(
                    "eval: failed to load fixture %s: %s",
                    yaml_path.name,
                    f"top-level YAML value is not a mapping ({type(raw).__name__})",
                )
                continue
            if raw.get("type") == "journey":
                continue  # Skip journey fixtures
            fixture = EvalFixture.from_yaml(yaml_path)
        except Exception as exc:
            # Deliberately broad: one bad fixture must not stop discovery.
            logger.error("eval: failed to load fixture %s: %s", yaml_path.name, exc)
        else:
            fixtures.append(fixture)
            logger.info("eval: loaded fixture %s from %s", fixture.name, yaml_path.name)

    return fixtures
|
|
|
|
|
|
# ── Journey fixtures ─────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class JourneyFixture:
    """A journey test scenario — tests the prompt_template builder conversation."""

    name: str
    description: str
    directory: str  # relative path to sample files
    data_types: list[str]
    expected_template_criteria: list[str]  # what the template should contain/satisfy
    user_messages: list[str] = field(default_factory=list)  # for automated journey runs (unused in interactive mode)
    models: list[str] = field(default_factory=list)
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property
    def fixture_dir(self) -> Path:
        """Path to the sample files directory.

        This is ``directory`` joined onto the folder that contains
        ``fixture_path``; it is absolute only when ``fixture_path`` is.
        """
        return self.fixture_path.parent / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "JourneyFixture":
        """Load a journey fixture from a YAML file.

        Args:
            path: Location of the fixture file; stored as ``fixture_path``.

        Returns:
            The populated fixture.

        Raises:
            ValueError: If the document's top level is not a mapping
                (for example, an empty file).
            KeyError: If ``name`` is missing.
        """
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        return cls._from_raw(raw, path)

    @classmethod
    def _from_raw(cls, raw: Any, path: Path) -> "JourneyFixture":
        """Build a journey fixture from an already-parsed YAML document."""
        if not isinstance(raw, dict):
            raise ValueError(
                f"fixture {path} must contain a YAML mapping at the top level, "
                f"got {type(raw).__name__}"
            )

        def value(key: str, default: Any) -> Any:
            # A key written without a value parses as None; treat it like a
            # missing key so list/str fields never end up holding None.
            found = raw.get(key)
            return default if found is None else found

        return cls(
            name=raw["name"],
            description=value("description", ""),
            directory=value("directory", "sample_files"),
            data_types=value("data_types", ["tasks"]),
            user_messages=value("user_messages", []),
            expected_template_criteria=value("expected_template_criteria", []),
            models=value("models", []),
            fixture_path=path,
        )
|
|
|
|
|
|
def discover_journey_fixtures(fixtures_dir: Path | None = None) -> list[JourneyFixture]:
    """Find and load all journey YAML fixtures in the fixtures directory.

    Args:
        fixtures_dir: Directory scanned for ``*.yaml`` files. Defaults to the
            ``fixtures`` directory next to this module.

    Returns:
        The loaded journey fixtures in sorted path order. Only files whose
        ``type`` is ``journey`` are loaded; files that cannot be loaded are
        logged and skipped. A missing directory yields an empty list.
    """
    if fixtures_dir is None:
        fixtures_dir = Path(__file__).parent / "fixtures"

    fixtures: list[JourneyFixture] = []
    if not fixtures_dir.is_dir():
        logger.warning("eval: fixtures directory not found: %s", fixtures_dir)
        return fixtures

    for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
        try:
            raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
            if not isinstance(raw, dict):
                # Empty files parse as None; say so explicitly rather than
                # relying on an AttributeError from ``raw.get`` below.
                logger.error(
                    "eval: failed to load journey fixture %s: %s",
                    yaml_path.name,
                    f"top-level YAML value is not a mapping ({type(raw).__name__})",
                )
                continue
            if raw.get("type") != "journey":
                continue
            fixture = JourneyFixture.from_yaml(yaml_path)
        except Exception as exc:
            # Deliberately broad: one bad fixture must not stop discovery.
            logger.error("eval: failed to load journey fixture %s: %s", yaml_path.name, exc)
        else:
            fixtures.append(fixture)
            logger.info("eval: loaded journey fixture %s from %s", fixture.name, yaml_path.name)

    return fixtures
|