- journey_runner.py: orchestrates journey start → simulated user messages → template extraction → LLM judge scoring
- config.py: JourneyFixture dataclass with user_messages and expected_template_criteria, discover_journey_fixtures()
- langfuse_eval.py: sync_journey_fixture_to_dataset()
- cli.py: new 'journey' subcommand (python -m eval journey) with --fixture, --models, --judge-model flags
- fixtures/journey_invoice_setup.yaml: example journey fixture with 4 user messages and 8 quality criteria
219 lines
7.4 KiB
Python
219 lines
7.4 KiB
Python
"""Eval configuration — YAML fixture loader and dataclasses.
|
|
|
|
A *fixture* is a YAML file that defines a complete test scenario:
|
|
|
|
.. code-block:: yaml
|
|
|
|
name: freelance-invoices
|
|
description: Extract tasks and notes from invoice PDFs (text layer)
|
|
directory: sample_files/invoices # relative to fixture dir
|
|
data_types: [tasks, notes]
|
|
file_extensions: [txt, md]
|
|
|
|
# Preseeded records the agent "sees" as existing data
|
|
seed_records:
|
|
projects:
|
|
- id: proj-1
|
|
name: "Website Redesign"
|
|
status: active
|
|
tasks: []
|
|
|
|
# Prompt variations to test (at least one required)
|
|
prompt_variants:
|
|
baseline: |
|
|
Extract action items as tasks and meeting summaries as notes.
|
|
Set priority based on urgency keywords.
|
|
detailed: |
|
|
Extract action items as tasks. Map "URGENT" to high priority,
|
|
"ASAP" to medium. Summaries become notes with full content.
|
|
|
|
# Expected extractions — what the agent SHOULD produce
|
|
expected:
|
|
tasks:
|
|
- title: "Send revised invoice to client"
|
|
priority: high
|
|
status: todo
|
|
- title: "Update project timeline"
|
|
priority: medium
|
|
notes:
|
|
- title: "Meeting summary - March kickoff"
|
|
|
|
# Optional: models to test (overrides CLI --models)
|
|
models: []
|
|
|
|
A *journey fixture* tests the prompt-template builder conversation:
|
|
|
|
.. code-block:: yaml
|
|
|
|
type: journey
|
|
name: journey-invoices
|
|
description: Test journey builds a good template for invoices
|
|
directory: sample_files/invoices
|
|
data_types: [tasks, notes]
|
|
|
|
# Simulated user responses for multi-turn conversation
|
|
user_messages:
|
|
- "I want to extract action items and meeting summaries"
|
|
- "Yes, map URGENTE to high priority"
|
|
- "That looks good, generate the template"
|
|
|
|
# Criteria the generated prompt_template should satisfy
|
|
expected_template_criteria:
|
|
- "mentions tasks and notes as target entities"
|
|
- "includes priority mapping rules"
|
|
- "references isAiSuggested=1"
|
|
- "does not mention projectId"
|
|
|
|
models: []
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import yaml
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ExpectedRecord:
    """One record the agent is expected to extract.

    Matching is partial: only the keys present in ``fields`` are checked,
    any field that is not listed is ignored.
    """

    # Target table: tasks | notes | timelines | projects
    table: str
    # field_name → expected_value
    fields: dict[str, Any]
|
|
|
|
|
|
@dataclass
class EvalFixture:
    """A complete extraction test scenario loaded from a YAML fixture file."""

    name: str
    description: str
    directory: str  # sample files directory, relative to the fixture file
    data_types: list[str]
    file_extensions: list[str]
    seed_records: dict[str, list[dict]]
    prompt_variants: dict[str, str]  # variant_name → prompt_template
    expected: list[ExpectedRecord]
    models: list[str]  # if empty, use CLI default
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property
    def fixture_dir(self) -> Path:
        """Return the sample files directory next to the fixture file.

        The path is ``fixture_path.parent / directory``; it is absolute only
        when ``fixture_path`` itself is absolute.
        """
        return self.fixture_path.parent / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "EvalFixture":
        """Load a fixture from a YAML file.

        Args:
            path: Location of the fixture file (read as UTF-8).

        Returns:
            The populated fixture, with ``fixture_path`` set to ``path``.

        Raises:
            ValueError: If the document is empty or its top level is not a
                mapping.
            KeyError: If the required ``name`` key is missing.
        """
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        return cls._from_mapping(raw, path)

    @classmethod
    def _from_mapping(cls, raw: Any, path: Path) -> "EvalFixture":
        """Build a fixture from an already-parsed YAML document."""
        # An empty file parses to None and a bare list/scalar is not a
        # fixture either; fail with a message that names the file.
        if not isinstance(raw, dict):
            raise ValueError(
                f"fixture {path} must contain a YAML mapping at the top "
                f"level, got {type(raw).__name__}"
            )

        def value_of(key: str, default: Any) -> Any:
            # A key written with no value ("models:") parses to None; treat
            # it like a missing key, but keep explicit empty values as-is.
            found = raw.get(key)
            return default if found is None else found

        expected: list[ExpectedRecord] = []
        for table, records in (raw.get("expected") or {}).items():
            # A table listed with no records parses to None.
            for rec in records or []:
                expected.append(ExpectedRecord(table=table, fields=rec))

        return cls(
            name=raw["name"],
            description=value_of("description", ""),
            directory=value_of("directory", "sample_files"),
            data_types=value_of("data_types", ["tasks"]),
            file_extensions=value_of("file_extensions", []),
            seed_records=value_of("seed_records", {}),
            prompt_variants=value_of("prompt_variants", {"default": ""}),
            expected=expected,
            models=value_of("models", []),
            fixture_path=path,
        )
|
|
|
|
|
|
def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:
    """Find and load all non-journey YAML fixtures in the fixtures directory.

    Args:
        fixtures_dir: Directory scanned for ``*.yaml`` files. Defaults to the
            ``fixtures`` folder next to this module.

    Returns:
        The loaded fixtures in sorted path order. Files whose top-level
        ``type`` is ``journey`` are skipped; files that fail to load are
        logged and skipped. The list is empty when the directory is missing.
    """
    if fixtures_dir is None:
        fixtures_dir = Path(__file__).parent / "fixtures"

    fixtures: list[EvalFixture] = []
    if not fixtures_dir.is_dir():
        logger.warning("eval: fixtures directory not found: %s", fixtures_dir)
        return fixtures

    for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
        try:
            raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
            # An empty file parses to None; report that clearly instead of
            # letting raw.get fail with an AttributeError.
            if not isinstance(raw, dict):
                raise ValueError(
                    "top-level YAML value must be a mapping, got "
                    f"{type(raw).__name__}"
                )
            if raw.get("type") == "journey":
                continue  # Skip journey fixtures
            fixture = EvalFixture.from_yaml(yaml_path)
        except Exception as exc:
            # Best-effort discovery: one bad file must not hide the others.
            logger.error("eval: failed to load fixture %s: %s", yaml_path.name, exc)
        else:
            fixtures.append(fixture)
            logger.info("eval: loaded fixture %s from %s", fixture.name, yaml_path.name)

    return fixtures
|
|
|
|
|
|
# ── Journey fixtures ─────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class JourneyFixture:
    """A journey test scenario — tests the prompt_template builder conversation."""

    name: str
    description: str
    directory: str  # sample files directory, relative to the fixture file
    data_types: list[str]
    user_messages: list[str]  # simulated user responses
    expected_template_criteria: list[str]  # what the template should contain/satisfy
    models: list[str]
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property
    def fixture_dir(self) -> Path:
        """Return the sample files directory next to the fixture file.

        The path is ``fixture_path.parent / directory``; it is absolute only
        when ``fixture_path`` itself is absolute.
        """
        return self.fixture_path.parent / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "JourneyFixture":
        """Load a journey fixture from a YAML file.

        Args:
            path: Location of the fixture file (read as UTF-8).

        Returns:
            The populated fixture, with ``fixture_path`` set to ``path``.

        Raises:
            ValueError: If the document is empty or its top level is not a
                mapping.
            KeyError: If the required ``name`` key is missing.
        """
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        return cls._from_mapping(raw, path)

    @classmethod
    def _from_mapping(cls, raw: Any, path: Path) -> "JourneyFixture":
        """Build a journey fixture from an already-parsed YAML document."""
        # An empty file parses to None and a bare list/scalar is not a
        # fixture either; fail with a message that names the file.
        if not isinstance(raw, dict):
            raise ValueError(
                f"journey fixture {path} must contain a YAML mapping at the "
                f"top level, got {type(raw).__name__}"
            )

        def value_of(key: str, default: Any) -> Any:
            # A key written with no value ("models:") parses to None; treat
            # it like a missing key, but keep explicit empty values as-is.
            found = raw.get(key)
            return default if found is None else found

        return cls(
            name=raw["name"],
            description=value_of("description", ""),
            directory=value_of("directory", "sample_files"),
            data_types=value_of("data_types", ["tasks"]),
            user_messages=value_of("user_messages", []),
            expected_template_criteria=value_of("expected_template_criteria", []),
            models=value_of("models", []),
            fixture_path=path,
        )
|
|
|
|
|
|
def discover_journey_fixtures(fixtures_dir: Path | None = None) -> list[JourneyFixture]:
    """Find and load all journey YAML fixtures in the fixtures directory.

    Args:
        fixtures_dir: Directory scanned for ``*.yaml`` files. Defaults to the
            ``fixtures`` folder next to this module.

    Returns:
        The loaded journey fixtures in sorted path order. Only files whose
        top-level ``type`` is ``journey`` are loaded; files that fail to load
        are logged and skipped. The list is empty when the directory is
        missing.
    """
    if fixtures_dir is None:
        fixtures_dir = Path(__file__).parent / "fixtures"

    fixtures: list[JourneyFixture] = []
    if not fixtures_dir.is_dir():
        logger.warning("eval: fixtures directory not found: %s", fixtures_dir)
        return fixtures

    for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
        try:
            raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
            # An empty file parses to None; report that clearly instead of
            # letting raw.get fail with an AttributeError.
            if not isinstance(raw, dict):
                raise ValueError(
                    "top-level YAML value must be a mapping, got "
                    f"{type(raw).__name__}"
                )
            if raw.get("type") != "journey":
                continue  # Not a journey fixture
            fixture = JourneyFixture.from_yaml(yaml_path)
        except Exception as exc:
            # Best-effort discovery: one bad file must not hide the others.
            logger.error("eval: failed to load journey fixture %s: %s", yaml_path.name, exc)
        else:
            fixtures.append(fixture)
            logger.info("eval: loaded journey fixture %s from %s", fixture.name, yaml_path.name)

    return fixtures
|