feat(batch-agent): add journey eval to E2E harness

- journey_runner.py: orchestrates journey start → simulated user
  messages → template extraction → LLM judge scoring
- config.py: JourneyFixture dataclass with user_messages and
  expected_template_criteria, discover_journey_fixtures()
- langfuse_eval.py: sync_journey_fixture_to_dataset()
- cli.py: new 'journey' subcommand (python -m eval journey)
  with --fixture, --models, --judge-model flags
- fixtures/journey_invoice_setup.yaml: example journey fixture
  with 4 user messages and 8 quality criteria
This commit is contained in:
Roberto Musso
2026-03-23 23:16:41 +01:00
parent d856dfd28c
commit 63fa119543
5 changed files with 643 additions and 11 deletions

View File

@@ -40,6 +40,31 @@ A *fixture* is a YAML file that defines a complete test scenario:
# Optional: models to test (overrides CLI --models)
models: []
A *journey fixture* tests the prompt-template builder conversation:
.. code-block:: yaml
type: journey
name: journey-invoices
description: Test journey builds a good template for invoices
directory: sample_files/invoices
data_types: [tasks, notes]
# Simulated user responses for multi-turn conversation
user_messages:
- "I want to extract action items and meeting summaries"
- "Yes, map URGENTE to high priority"
- "That looks good, generate the template"
# Criteria the generated prompt_template should satisfy
expected_template_criteria:
- "mentions tasks and notes as target entities"
- "includes priority mapping rules"
- "references isAiSuggested=1"
- "does not mention projectId"
models: []
"""
from __future__ import annotations
@@ -121,9 +146,73 @@ def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:
for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
try:
raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
if raw.get("type") == "journey":
continue # Skip journey fixtures
fixtures.append(EvalFixture.from_yaml(yaml_path))
logger.info("eval: loaded fixture %s from %s", fixtures[-1].name, yaml_path.name)
except Exception as exc:
logger.error("eval: failed to load fixture %s: %s", yaml_path.name, exc)
return fixtures
# ── Journey fixtures ─────────────────────────────────────────────────────
@dataclass
class JourneyFixture:
"""A journey test scenario — tests the prompt_template builder conversation."""
name: str
description: str
directory: str # relative path to sample files
data_types: list[str]
user_messages: list[str] # simulated user responses
expected_template_criteria: list[str] # what the template should contain/satisfy
models: list[str]
fixture_path: Path = field(default_factory=lambda: Path("."))
@property
def fixture_dir(self) -> Path:
"""Absolute path to the sample files directory."""
return self.fixture_path.parent / self.directory
@classmethod
def from_yaml(cls, path: Path) -> "JourneyFixture":
"""Load a journey fixture from a YAML file."""
raw = yaml.safe_load(path.read_text(encoding="utf-8"))
return cls(
name=raw["name"],
description=raw.get("description", ""),
directory=raw.get("directory", "sample_files"),
data_types=raw.get("data_types", ["tasks"]),
user_messages=raw.get("user_messages", []),
expected_template_criteria=raw.get("expected_template_criteria", []),
models=raw.get("models", []),
fixture_path=path,
)
def discover_journey_fixtures(fixtures_dir: Path | None = None) -> list[JourneyFixture]:
"""Find and load all journey YAML fixtures in the fixtures directory."""
if fixtures_dir is None:
fixtures_dir = Path(__file__).parent / "fixtures"
fixtures: list[JourneyFixture] = []
if not fixtures_dir.is_dir():
logger.warning("eval: fixtures directory not found: %s", fixtures_dir)
return fixtures
for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
try:
raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
if raw.get("type") != "journey":
continue
fixtures.append(JourneyFixture.from_yaml(yaml_path))
logger.info("eval: loaded journey fixture %s from %s", fixtures[-1].name, yaml_path.name)
except Exception as exc:
logger.error("eval: failed to load journey fixture %s: %s", yaml_path.name, exc)
return fixtures