# api/services/batch-agent/eval/config.py

"""Eval configuration — YAML fixture loader and dataclasses.

A *fixture* is a YAML file that defines a complete test scenario:

.. code-block:: yaml

    name: freelance-invoices
    description: Extract tasks and notes from invoice PDFs (text layer)
    directory: sample_files/invoices      # relative to fixture dir
    data_types: [tasks, notes]
    file_extensions: [txt, md]

    # Preseeded records the agent "sees" as existing data
    seed_records:
      projects:
        - id: proj-1
          name: "Website Redesign"
          status: active
      tasks: []

    # Prompt variations to test (at least one required)
    prompt_variants:
      baseline: |
        Extract action items as tasks and meeting summaries as notes.
        Set priority based on urgency keywords.
      detailed: |
        Extract action items as tasks. Map "URGENT" to high priority,
        "ASAP" to medium. Summaries become notes with full content.

    # Expected extractions — what the agent SHOULD produce
    expected:
      tasks:
        - title: "Send revised invoice to client"
          priority: high
          status: todo
        - title: "Update project timeline"
          priority: medium
      notes:
        - title: "Meeting summary - March kickoff"

    # Optional: models to test (overrides CLI --models)
    models: []

A *journey fixture* tests the prompt-template builder conversation:

.. code-block:: yaml

    type: journey
    name: journey-invoices
    description: Test journey builds a good template for invoices
    directory: sample_files/invoices
    data_types: [tasks, notes]

    # Simulated user responses for multi-turn conversation
    user_messages:
      - "I want to extract action items and meeting summaries"
      - "Yes, map URGENTE to high priority"
      - "That looks good, generate the template"

    # Criteria the generated prompt_template should satisfy
    expected_template_criteria:
      - "mentions tasks and notes as target entities"
      - "includes priority mapping rules"
      - "references isAiSuggested=1"
      - "does not mention projectId"

    models: []
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import yaml

logger = logging.getLogger(__name__)


@dataclass
class ExpectedRecord:
    """A single expected extraction result.

    The fixture loader creates one instance per entry listed under a table
    name in the fixture's ``expected`` mapping.

    Only the fields specified are checked — unspecified fields are ignored.
    """

    # Key of the ``expected`` mapping this record was listed under.
    table: str  # tasks | notes | timelines | projects
    # The YAML entry itself, stored as-is by the loader.
    fields: dict[str, Any]  # field_name → expected_value


@dataclass
class EvalFixture:
    """A complete extraction test scenario loaded from a YAML fixture file.

    Attributes mirror the top-level YAML keys. ``expected`` is flattened from
    the YAML ``{table: [records]}`` mapping into a list of ``ExpectedRecord``.
    """

    name: str  # the only required YAML key
    description: str
    directory: str  # sample files location, relative to the fixture file's directory
    data_types: list[str]
    file_extensions: list[str]
    seed_records: dict[str, list[dict]]  # table → preseeded records
    prompt_variants: dict[str, str]  # variant_name → prompt_template
    expected: list[ExpectedRecord]
    models: list[str]  # if empty, use CLI default
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property
    def fixture_dir(self) -> Path:
        """Path to the sample files directory.

        Resolved against the directory containing the fixture file, so the
        result is absolute only when ``fixture_path`` is absolute.
        """
        return self.fixture_path.parent / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "EvalFixture":
        """Load a fixture from a YAML file.

        Args:
            path: Location of the fixture file; stored as ``fixture_path``.

        Returns:
            The populated fixture. Optional keys that are missing or null
            fall back to their defaults.

        Raises:
            ValueError: If the document's top level is not a mapping
                (for example, the file is empty).
            KeyError: If the required ``name`` key is missing.
        """
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        return cls._from_mapping(raw, path)

    @classmethod
    def _from_mapping(cls, raw: Any, path: Path) -> "EvalFixture":
        """Build a fixture from an already-parsed YAML document."""
        if not isinstance(raw, dict):
            raise ValueError(
                f"fixture {path} must contain a YAML mapping at the top level, "
                f"got {type(raw).__name__}"
            )

        def value_or(key: str, default: Any) -> Any:
            # A key written without a value (``models:``) parses as None, which
            # ``dict.get(key, default)`` would hand back unchanged. Treat it as
            # missing; explicit empty values ([] / {}) are kept as written.
            value = raw.get(key)
            return default if value is None else value

        # ``records or []`` tolerates a table listed with no entries (``tasks:``).
        expected = [
            ExpectedRecord(table=table, fields=rec)
            for table, records in (raw.get("expected") or {}).items()
            for rec in records or []
        ]

        return cls(
            name=raw["name"],
            description=value_or("description", ""),
            directory=value_or("directory", "sample_files"),
            data_types=value_or("data_types", ["tasks"]),
            file_extensions=value_or("file_extensions", []),
            seed_records=value_or("seed_records", {}),
            prompt_variants=value_or("prompt_variants", {"default": ""}),
            expected=expected,
            models=value_or("models", []),
            fixture_path=path,
        )


def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:
    """Load every non-journey ``*.yaml`` fixture found in a directory.

    Args:
        fixtures_dir: Directory to scan. When omitted, the ``fixtures``
            directory next to this module is used.

    Returns:
        Fixtures in sorted path order. Files that fail to load are logged
        and skipped; a missing directory yields an empty list.
    """
    search_dir = fixtures_dir if fixtures_dir is not None else Path(__file__).parent / "fixtures"

    loaded: list[EvalFixture] = []
    if not search_dir.is_dir():
        logger.warning("eval: fixtures directory not found: %s", search_dir)
        return loaded

    candidates = sorted(search_dir.glob("*.yaml"))
    for candidate in candidates:
        try:
            document = yaml.safe_load(candidate.read_text(encoding="utf-8"))
            # Journey fixtures are loaded by their own discovery function.
            is_journey = document.get("type") == "journey"
            if not is_journey:
                fixture = EvalFixture.from_yaml(candidate)
                loaded.append(fixture)
                logger.info("eval: loaded fixture %s from %s", fixture.name, candidate.name)
        except Exception as err:
            # Best effort: one broken fixture must not abort discovery.
            logger.error("eval: failed to load fixture %s: %s", candidate.name, err)

    return loaded


# ── Journey fixtures ─────────────────────────────────────────────────────


@dataclass
class JourneyFixture:
    """A journey test scenario — tests the prompt_template builder conversation.

    Attributes mirror the top-level keys of a journey fixture YAML file.
    """

    name: str  # the only required YAML key
    description: str
    directory: str  # sample files location, relative to the fixture file's directory
    data_types: list[str]
    user_messages: list[str]  # simulated user responses
    expected_template_criteria: list[str]  # what the template should contain/satisfy
    models: list[str]
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property
    def fixture_dir(self) -> Path:
        """Path to the sample files directory.

        Resolved against the directory containing the fixture file, so the
        result is absolute only when ``fixture_path`` is absolute.
        """
        return self.fixture_path.parent / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "JourneyFixture":
        """Load a journey fixture from a YAML file.

        Args:
            path: Location of the fixture file; stored as ``fixture_path``.

        Returns:
            The populated fixture. Optional keys that are missing or null
            fall back to their defaults.

        Raises:
            ValueError: If the document's top level is not a mapping
                (for example, the file is empty).
            KeyError: If the required ``name`` key is missing.
        """
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        return cls._from_mapping(raw, path)

    @classmethod
    def _from_mapping(cls, raw: Any, path: Path) -> "JourneyFixture":
        """Build a journey fixture from an already-parsed YAML document."""
        if not isinstance(raw, dict):
            raise ValueError(
                f"journey fixture {path} must contain a YAML mapping at the top level, "
                f"got {type(raw).__name__}"
            )

        def value_or(key: str, default: Any) -> Any:
            # A key written without a value (``models:``) parses as None, which
            # ``dict.get(key, default)`` would hand back unchanged. Treat it as
            # missing; explicit empty values are kept as written.
            value = raw.get(key)
            return default if value is None else value

        return cls(
            name=raw["name"],
            description=value_or("description", ""),
            directory=value_or("directory", "sample_files"),
            data_types=value_or("data_types", ["tasks"]),
            user_messages=value_or("user_messages", []),
            expected_template_criteria=value_or("expected_template_criteria", []),
            models=value_or("models", []),
            fixture_path=path,
        )


def discover_journey_fixtures(fixtures_dir: Path | None = None) -> list[JourneyFixture]:
    """Load every ``*.yaml`` fixture whose ``type`` is ``journey``.

    Args:
        fixtures_dir: Directory to scan. When omitted, the ``fixtures``
            directory next to this module is used.

    Returns:
        Journey fixtures in sorted path order. Files that fail to load are
        logged and skipped; a missing directory yields an empty list.
    """
    search_dir = fixtures_dir if fixtures_dir is not None else Path(__file__).parent / "fixtures"

    loaded: list[JourneyFixture] = []
    if not search_dir.is_dir():
        logger.warning("eval: fixtures directory not found: %s", search_dir)
        return loaded

    candidates = sorted(search_dir.glob("*.yaml"))
    for candidate in candidates:
        try:
            document = yaml.safe_load(candidate.read_text(encoding="utf-8"))
            # Only documents explicitly tagged as journeys belong here.
            is_journey = document.get("type") == "journey"
            if is_journey:
                fixture = JourneyFixture.from_yaml(candidate)
                loaded.append(fixture)
                logger.info("eval: loaded journey fixture %s from %s", fixture.name, candidate.name)
        except Exception as err:
            # Best effort: one broken fixture must not abort discovery.
            logger.error("eval: failed to load journey fixture %s: %s", candidate.name, err)

    return loaded