feat(batch-agent): add journey eval to E2E harness

- journey_runner.py: orchestrates journey start → simulated user messages → template extraction → LLM judge scoring
- config.py: JourneyFixture dataclass with user_messages and expected_template_criteria, discover_journey_fixtures()
- langfuse_eval.py: sync_journey_fixture_to_dataset()
- cli.py: new 'journey' subcommand (python -m eval journey) with --fixture, --models, --judge-model flags
- fixtures/journey_invoice_setup.yaml: example journey fixture with 4 user messages and 8 quality criteria
2026-03-23 23:16:41 +01:00
parent d856dfd28c
commit 63fa119543
5 changed files with 643 additions and 11 deletions
--- a/services/batch-agent/eval/config.py
+++ b/services/batch-agent/eval/config.py
@@ -40,6 +40,31 @@ A *fixture* is a YAML file that defines a complete test scenario:

    # Optional: models to test (overrides CLI --models)
    models: []
+
+A *journey fixture* tests the prompt-template builder conversation:
+
+.. code-block:: yaml
+
+    type: journey
+    name: journey-invoices
+    description: Test journey builds a good template for invoices
+    directory: sample_files/invoices
+    data_types: [tasks, notes]
+
+    # Simulated user responses for multi-turn conversation
+    user_messages:
+      - "I want to extract action items and meeting summaries"
+      - "Yes, map URGENTE to high priority"
+      - "That looks good, generate the template"
+
+    # Criteria the generated prompt_template should satisfy
+    expected_template_criteria:
+      - "mentions tasks and notes as target entities"
+      - "includes priority mapping rules"
+      - "references isAiSuggested=1"
+      - "does not mention projectId"
+
+    models: []
 """

 from __future__ import annotations
@@ -121,9 +146,73 @@ def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:

    for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
        try:
+            raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
+            if raw.get("type") == "journey":
+                continue  # Skip journey fixtures
            fixtures.append(EvalFixture.from_yaml(yaml_path))
            logger.info("eval: loaded fixture %s from %s", fixtures[-1].name, yaml_path.name)
        except Exception as exc:
            logger.error("eval: failed to load fixture %s: %s", yaml_path.name, exc)

    return fixtures
+
+
+# ── Journey fixtures ─────────────────────────────────────────────────────
+
+
+@dataclass
+class JourneyFixture:
+    """A journey test scenario — tests the prompt_template builder conversation.
+
+    Instances are normally built with :meth:`from_yaml` from a YAML file.
+    Only ``name`` is a required key in that file; every other field falls
+    back to the default listed in :meth:`from_yaml`.
+    """
+
+    # Value of the required ``name`` key in the fixture YAML.
+    name: str
+    # Free-text description; empty string when the YAML omits it.
+    description: str
+    directory: str  # relative path to sample files
+    # Values of the ``data_types`` key, e.g. ``[tasks, notes]``.
+    data_types: list[str]
+    user_messages: list[str]  # simulated user responses
+    expected_template_criteria: list[str]  # what the template should contain/satisfy
+    # Values of the ``models`` key; empty list when the YAML omits it.
+    models: list[str]
+    # Path of the YAML file this fixture was loaded from; ``Path(".")`` when
+    # the instance is constructed without one.
+    fixture_path: Path = field(default_factory=lambda: Path("."))
+
+    @property
+    def fixture_dir(self) -> Path:
+        """Path to the sample files directory.
+
+        Computed as ``directory`` joined onto the parent directory of
+        ``fixture_path``. No resolution is performed, so the result is only
+        absolute when ``fixture_path`` itself is absolute.
+        """
+        return self.fixture_path.parent / self.directory
+
+    @classmethod
+    def from_yaml(cls, path: Path) -> "JourneyFixture":
+        """Load a journey fixture from a YAML file.
+
+        Args:
+            path: Location of the YAML file; read as UTF-8 and stored on the
+                result as ``fixture_path``.
+
+        Returns:
+            A ``JourneyFixture`` populated from the parsed document.
+
+        Raises:
+            KeyError: If the parsed mapping has no ``name`` key.
+
+        Note:
+            The ``type`` key is not checked here; callers that need to
+            distinguish journey fixtures must do so themselves.
+        """
+        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
+
+        # ``name`` is the only mandatory key; the rest use ``.get`` defaults.
+        return cls(
+            name=raw["name"],
+            description=raw.get("description", ""),
+            directory=raw.get("directory", "sample_files"),
+            data_types=raw.get("data_types", ["tasks"]),
+            user_messages=raw.get("user_messages", []),
+            expected_template_criteria=raw.get("expected_template_criteria", []),
+            models=raw.get("models", []),
+            fixture_path=path,
+        )
+
+
+def discover_journey_fixtures(fixtures_dir: Path | None = None) -> list[JourneyFixture]:
+    """Find and load all journey YAML fixtures in the fixtures directory.
+
+    Args:
+        fixtures_dir: Directory to scan for ``*.yaml`` files. When ``None``,
+            the ``fixtures`` directory next to this module is used.
+
+    Returns:
+        The fixtures whose top-level ``type`` key equals ``"journey"``, in
+        sorted file-path order. The list is empty when the directory does not
+        exist. Files that fail to load are logged and left out rather than
+        aborting the scan.
+    """
+    if fixtures_dir is None:
+        fixtures_dir = Path(__file__).parent / "fixtures"
+
+    fixtures: list[JourneyFixture] = []
+    if not fixtures_dir.is_dir():
+        logger.warning("eval: fixtures directory not found: %s", fixtures_dir)
+        return fixtures
+
+    # Only the top level of the directory is scanned (non-recursive glob).
+    for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
+        try:
+            # Parse once here just to read ``type``; ``from_yaml`` below reads
+            # and parses the same file a second time.
+            raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
+            if raw.get("type") != "journey":
+                continue
+            fixtures.append(JourneyFixture.from_yaml(yaml_path))
+            logger.info("eval: loaded journey fixture %s from %s", fixtures[-1].name, yaml_path.name)
+        except Exception as exc:
+            # Broad catch: a single bad file must not stop discovery. This also
+            # fires for any YAML whose top level has no ``.get`` (e.g. a
+            # document that does not parse to a mapping), even if that file
+            # was never meant to be a journey fixture.
+            logger.error("eval: failed to load journey fixture %s: %s", yaml_path.name, exc)
+
+    return fixtures