feat(batch-agent): add journey eval to E2E harness
- journey_runner.py: orchestrates journey start → simulated user messages → template extraction → LLM judge scoring
- config.py: JourneyFixture dataclass with user_messages and expected_template_criteria, discover_journey_fixtures()
- langfuse_eval.py: sync_journey_fixture_to_dataset()
- cli.py: new 'journey' subcommand (python -m eval journey) with --fixture, --models, --judge-model flags
- fixtures/journey_invoice_setup.yaml: example journey fixture with 4 user messages and 8 quality criteria
This commit is contained in:
@@ -3,13 +3,17 @@
|
|||||||
Usage::
|
Usage::
|
||||||
|
|
||||||
# From services/batch-agent/:
|
# From services/batch-agent/:
|
||||||
python -m eval run # all fixtures, default model
|
python -m eval run # all agent fixtures, default model
|
||||||
python -m eval run --fixture=freelance-invoices # single fixture
|
python -m eval run --fixture=freelance-invoices # single fixture
|
||||||
python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4
|
python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4
|
||||||
python -m eval run --variants=baseline,detailed # specific prompt variants
|
python -m eval run --variants=baseline,detailed # specific prompt variants
|
||||||
python -m eval run --no-judge # skip LLM judge scoring
|
python -m eval run --no-judge # skip LLM judge scoring
|
||||||
|
|
||||||
python -m eval list # list available fixtures
|
python -m eval journey # all journey fixtures
|
||||||
|
python -m eval journey --fixture=journey-invoices # single journey fixture
|
||||||
|
python -m eval journey --models=gpt-4o,anthropic/claude-sonnet-4
|
||||||
|
|
||||||
|
python -m eval list # list all fixtures
|
||||||
python -m eval sync # sync fixtures to Langfuse datasets
|
python -m eval sync # sync fixtures to Langfuse datasets
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -28,8 +32,9 @@ for p in (_SERVICE_ROOT, _REPO_ROOT):
|
|||||||
if str(p) not in sys.path:
|
if str(p) not in sys.path:
|
||||||
sys.path.insert(0, str(p))
|
sys.path.insert(0, str(p))
|
||||||
|
|
||||||
from eval.config import discover_fixtures
|
from eval.config import discover_fixtures, discover_journey_fixtures
|
||||||
from eval.runner import run_fixture_eval, print_results
|
from eval.runner import run_fixture_eval, print_results
|
||||||
|
from eval.journey_runner import run_journey_fixture_eval, print_journey_results
|
||||||
from eval import langfuse_eval
|
from eval import langfuse_eval
|
||||||
|
|
||||||
|
|
||||||
@@ -90,6 +95,29 @@ def _parse_args() -> argparse.Namespace:
|
|||||||
list_cmd.add_argument("--fixtures-dir", default=None)
|
list_cmd.add_argument("--fixtures-dir", default=None)
|
||||||
list_cmd.add_argument("-v", "--verbose", action="store_true")
|
list_cmd.add_argument("-v", "--verbose", action="store_true")
|
||||||
|
|
||||||
|
# ── journey ───────────────────────────────────────────────────
|
||||||
|
journey_cmd = sub.add_parser("journey", help="Run journey evaluations")
|
||||||
|
journey_cmd.add_argument(
|
||||||
|
"--fixture", "-f",
|
||||||
|
help="Run only the named journey fixture (default: all)",
|
||||||
|
)
|
||||||
|
journey_cmd.add_argument(
|
||||||
|
"--models", "-m",
|
||||||
|
default="gpt-4o",
|
||||||
|
help="Comma-separated list of models to test (default: gpt-4o)",
|
||||||
|
)
|
||||||
|
journey_cmd.add_argument(
|
||||||
|
"--judge-model",
|
||||||
|
default="gpt-4o-mini",
|
||||||
|
help="Model for LLM judge (default: gpt-4o-mini)",
|
||||||
|
)
|
||||||
|
journey_cmd.add_argument(
|
||||||
|
"--fixtures-dir",
|
||||||
|
default=None,
|
||||||
|
help="Path to fixtures directory (default: eval/fixtures/)",
|
||||||
|
)
|
||||||
|
journey_cmd.add_argument("-v", "--verbose", action="store_true")
|
||||||
|
|
||||||
# ── sync ──────────────────────────────────────────────────────
|
# ── sync ──────────────────────────────────────────────────────
|
||||||
sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets")
|
sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets")
|
||||||
sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
|
sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
|
||||||
@@ -136,25 +164,41 @@ async def _cmd_run(args: argparse.Namespace) -> None:
|
|||||||
|
|
||||||
def _cmd_list(args: argparse.Namespace) -> None:
    """Print one table per fixture kind (agent, journey) found in the fixtures directory.

    Prints "No fixtures found." and returns when neither kind is present.
    """
    agent_fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir))
    journeys = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))

    if not (agent_fixtures or journeys):
        print("No fixtures found.")
        return

    rule = "-" * 90

    if agent_fixtures:
        print(f"\n{'[Agent Fixtures]'}")
        print(f"{'Name':<30} {'Types':<25} {'Variants':<20} {'Expected'}")
        print(rule)
        for fx in agent_fixtures:
            types = ", ".join(fx.data_types)
            variants = ", ".join(fx.prompt_variants.keys())
            print(f"{fx.name:<30} {types:<25} {variants:<20} {len(fx.expected)}")

    if journeys:
        print(f"\n{'[Journey Fixtures]'}")
        print(f"{'Name':<30} {'Types':<25} {'Messages':<10} {'Criteria'}")
        print(rule)
        for fx in journeys:
            types = ", ".join(fx.data_types)
            n_messages = len(fx.user_messages)
            n_criteria = len(fx.expected_template_criteria)
            print(f"{fx.name:<30} {types:<25} {n_messages:<10} {n_criteria}")

    print()
|
||||||
|
|
||||||
|
|
||||||
def _cmd_sync(args: argparse.Namespace) -> None:
|
def _cmd_sync(args: argparse.Namespace) -> None:
|
||||||
fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir))
|
fixtures = discover_fixtures(_fixtures_dir(args.fixtures_dir))
|
||||||
|
journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))
|
||||||
|
|
||||||
if args.fixture:
|
if args.fixture:
|
||||||
fixtures = [f for f in fixtures if f.name == args.fixture]
|
fixtures = [f for f in fixtures if f.name == args.fixture]
|
||||||
|
journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture]
|
||||||
|
|
||||||
if not fixtures:
|
if not fixtures and not journey_fixtures:
|
||||||
print("No fixtures to sync.")
|
print("No fixtures to sync.")
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -165,6 +209,39 @@ def _cmd_sync(args: argparse.Namespace) -> None:
|
|||||||
else:
|
else:
|
||||||
print(f"Skipped: {fixture.name} (Langfuse not configured)")
|
print(f"Skipped: {fixture.name} (Langfuse not configured)")
|
||||||
|
|
||||||
|
for fixture in journey_fixtures:
|
||||||
|
name = langfuse_eval.sync_journey_fixture_to_dataset(fixture)
|
||||||
|
if name:
|
||||||
|
print(f"Synced: {fixture.name} → {name}")
|
||||||
|
else:
|
||||||
|
print(f"Skipped: {fixture.name} (Langfuse not configured)")
|
||||||
|
|
||||||
|
|
||||||
|
async def _cmd_journey(args: argparse.Namespace) -> None:
    """Run journey evaluations for the selected fixtures and print a summary.

    Discovers journey fixtures under ``args.fixtures_dir``, optionally narrows
    them to the one named by ``args.fixture``, evaluates each fixture against
    its model list and prints the combined results.

    Model selection: a fixture's own non-empty ``models`` list wins; otherwise
    the comma-separated ``--models`` value is used. This follows the fixture
    YAML comments ("overrides CLI --models", "empty = use CLI --models
    default").
    """
    journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))
    if not journey_fixtures:
        print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.")
        return

    if args.fixture:
        journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture]
        if not journey_fixtures:
            print(f"Journey fixture '{args.fixture}' not found.")
            return

    # Drop empty entries so a stray or trailing comma ("a,,b", "a,") does not
    # schedule a run against a model named "".
    cli_models = [m.strip() for m in args.models.split(",") if m.strip()]

    all_results = []
    for fixture in journey_fixtures:
        results = await run_journey_fixture_eval(
            fixture,
            models=fixture.models or cli_models,
            judge_model=args.judge_model,
        )
        all_results.extend(results)

    print_journey_results(all_results)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
args = _parse_args()
|
args = _parse_args()
|
||||||
@@ -172,6 +249,8 @@ def main() -> None:
|
|||||||
|
|
||||||
if args.command == "run":
|
if args.command == "run":
|
||||||
asyncio.run(_cmd_run(args))
|
asyncio.run(_cmd_run(args))
|
||||||
|
elif args.command == "journey":
|
||||||
|
asyncio.run(_cmd_journey(args))
|
||||||
elif args.command == "list":
|
elif args.command == "list":
|
||||||
_cmd_list(args)
|
_cmd_list(args)
|
||||||
elif args.command == "sync":
|
elif args.command == "sync":
|
||||||
|
|||||||
@@ -40,6 +40,31 @@ A *fixture* is a YAML file that defines a complete test scenario:
|
|||||||
|
|
||||||
# Optional: models to test (overrides CLI --models)
|
# Optional: models to test (overrides CLI --models)
|
||||||
models: []
|
models: []
|
||||||
|
|
||||||
|
A *journey fixture* tests the prompt-template builder conversation:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
type: journey
|
||||||
|
name: journey-invoices
|
||||||
|
description: Test journey builds a good template for invoices
|
||||||
|
directory: sample_files/invoices
|
||||||
|
data_types: [tasks, notes]
|
||||||
|
|
||||||
|
# Simulated user responses for multi-turn conversation
|
||||||
|
user_messages:
|
||||||
|
- "I want to extract action items and meeting summaries"
|
||||||
|
- "Yes, map URGENTE to high priority"
|
||||||
|
- "That looks good, generate the template"
|
||||||
|
|
||||||
|
# Criteria the generated prompt_template should satisfy
|
||||||
|
expected_template_criteria:
|
||||||
|
- "mentions tasks and notes as target entities"
|
||||||
|
- "includes priority mapping rules"
|
||||||
|
- "references isAiSuggested=1"
|
||||||
|
- "does not mention projectId"
|
||||||
|
|
||||||
|
models: []
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -121,9 +146,73 @@ def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:
|
|||||||
|
|
||||||
for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
|
for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
|
||||||
try:
|
try:
|
||||||
|
raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
|
||||||
|
if raw.get("type") == "journey":
|
||||||
|
continue # Skip journey fixtures
|
||||||
fixtures.append(EvalFixture.from_yaml(yaml_path))
|
fixtures.append(EvalFixture.from_yaml(yaml_path))
|
||||||
logger.info("eval: loaded fixture %s from %s", fixtures[-1].name, yaml_path.name)
|
logger.info("eval: loaded fixture %s from %s", fixtures[-1].name, yaml_path.name)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.error("eval: failed to load fixture %s: %s", yaml_path.name, exc)
|
logger.error("eval: failed to load fixture %s: %s", yaml_path.name, exc)
|
||||||
|
|
||||||
return fixtures
|
return fixtures
|
||||||
|
|
||||||
|
|
||||||
|
# ── Journey fixtures ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class JourneyFixture:
    """Scenario definition for evaluating the prompt_template builder conversation.

    Instances are usually built from a YAML file via :meth:`from_yaml`.
    """

    name: str
    description: str
    directory: str  # sample-files location, relative to the fixture file's folder
    data_types: list[str]
    user_messages: list[str]  # simulated user replies
    expected_template_criteria: list[str]  # statements the generated template should satisfy
    models: list[str]
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property
    def fixture_dir(self) -> Path:
        """Sample-files directory: ``directory`` joined onto the fixture file's parent folder."""
        base_dir = self.fixture_path.parent
        return base_dir / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "JourneyFixture":
        """Parse the YAML document at *path* into a fixture.

        ``name`` is the only required key; the remaining keys fall back to
        the defaults listed below. ``fixture_path`` is set to *path*.
        """
        document = yaml.safe_load(path.read_text(encoding="utf-8"))
        fixture_name = document["name"]

        # Built per call so each fixture gets its own default list objects.
        optional_defaults = {
            "description": "",
            "directory": "sample_files",
            "data_types": ["tasks"],
            "user_messages": [],
            "expected_template_criteria": [],
            "models": [],
        }
        values = {
            key: document.get(key, fallback)
            for key, fallback in optional_defaults.items()
        }
        return cls(name=fixture_name, fixture_path=path, **values)
|
||||||
|
|
||||||
|
|
||||||
|
def discover_journey_fixtures(fixtures_dir: Path | None = None) -> list[JourneyFixture]:
    """Load every journey fixture (``type: journey``) found in *fixtures_dir*.

    Args:
        fixtures_dir: Directory scanned for ``*.yaml`` files. Defaults to the
            ``fixtures`` folder next to this module.

    Returns:
        The loaded fixtures, in sorted file-path order. A missing directory
        yields an empty list after a warning. Files that fail to load are
        logged and skipped, so one broken fixture does not hide the others.
    """
    if fixtures_dir is None:
        fixtures_dir = Path(__file__).parent / "fixtures"

    fixtures: list[JourneyFixture] = []
    if not fixtures_dir.is_dir():
        logger.warning("eval: fixtures directory not found: %s", fixtures_dir)
        return fixtures

    for yaml_path in sorted(fixtures_dir.glob("*.yaml")):
        try:
            raw = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
            # An empty file parses to None and a top-level list has no
            # "type" key. Neither can be a journey fixture, so skip them
            # quietly instead of reporting a load failure.
            if not isinstance(raw, dict) or raw.get("type") != "journey":
                continue
            fixture = JourneyFixture.from_yaml(yaml_path)
        except Exception as exc:
            logger.error("eval: failed to load journey fixture %s: %s", yaml_path.name, exc)
            continue

        fixtures.append(fixture)
        logger.info("eval: loaded journey fixture %s from %s", fixture.name, yaml_path.name)

    return fixtures
|
||||||
|
|||||||
@@ -0,0 +1,46 @@
|
|||||||
|
# Journey Fixture: journey-invoice-setup
|
||||||
|
# Tests that the journey chatbot correctly builds a prompt_template
|
||||||
|
# for extracting tasks and notes from Italian invoices and meeting notes.
|
||||||
|
|
||||||
|
type: journey
|
||||||
|
name: journey-invoice-setup
|
||||||
|
description: >
|
||||||
|
Test the journey chatbot's ability to explore a directory of Italian
|
||||||
|
invoices and meeting notes, ask relevant questions, and produce a
|
||||||
|
well-structured prompt_template for data extraction.
|
||||||
|
|
||||||
|
directory: sample_files/invoices
|
||||||
|
data_types: [tasks, notes, timelines]
|
||||||
|
|
||||||
|
# Simulated user responses (the journey starts with the LLM exploring
|
||||||
|
# the directory and asking its first question)
|
||||||
|
user_messages:
|
||||||
|
- >
|
||||||
|
I want to extract action items from invoices and meeting notes.
|
||||||
|
The invoices are in Italian and contain work descriptions with
|
||||||
|
deadlines. Meeting notes have action items with checkboxes.
|
||||||
|
- >
|
||||||
|
Yes, map Italian priority keywords: "URGENTE" and "ALTA PRIORITÀ"
|
||||||
|
should be high priority, "media priorità" is medium, "bassa priorità"
|
||||||
|
is low. Items marked with [x] are already completed.
|
||||||
|
- >
|
||||||
|
For notes, I want meeting summaries with the full content including
|
||||||
|
decisions and attendees. For timelines, extract deadlines and
|
||||||
|
scheduled meeting dates.
|
||||||
|
- >
|
||||||
|
That's everything I need. Please generate the template.
|
||||||
|
|
||||||
|
# Criteria the generated prompt_template must satisfy
|
||||||
|
# Each is scored 0-1 by an LLM judge
|
||||||
|
expected_template_criteria:
|
||||||
|
- "Mentions creating tasks from action items and work descriptions"
|
||||||
|
- "Includes Italian priority keyword mapping (URGENTE→high, media priorità→medium, bassa priorità→low)"
|
||||||
|
- "Handles completed items marked with [x] as status done"
|
||||||
|
- "Mentions creating notes from meeting summaries"
|
||||||
|
- "Mentions extracting timeline events from deadlines and meeting dates"
|
||||||
|
- "Sets isAiSuggested=1 on all created records"
|
||||||
|
- "Does NOT include projectId assignment logic"
|
||||||
|
- "Uses camelCase field names (title, status, priority, dueDate, content)"
|
||||||
|
|
||||||
|
# Models to test (empty = use CLI --models default)
|
||||||
|
models: []
|
||||||
372
services/batch-agent/eval/journey_runner.py
Normal file
372
services/batch-agent/eval/journey_runner.py
Normal file
@@ -0,0 +1,372 @@
|
|||||||
|
"""Journey eval runner — tests the prompt_template builder conversation.
|
||||||
|
|
||||||
|
For each (journey_fixture × model) combination:
|
||||||
|
1. Build a MockExecutor (for filesystem tools used during journey)
|
||||||
|
2. Patch execute_on_client
|
||||||
|
3. Override LLM_MODEL
|
||||||
|
4. Call handle_journey_start to kick off the conversation
|
||||||
|
5. Feed simulated user_messages via handle_journey_message
|
||||||
|
6. Collect the generated prompt_template
|
||||||
|
7. Score it against expected_template_criteria (via LLM judge)
|
||||||
|
8. Report to Langfuse
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from langchain_core.messages import HumanMessage, SystemMessage
|
||||||
|
|
||||||
|
from eval.config import JourneyFixture
|
||||||
|
from eval.mock_executor import MockExecutor
|
||||||
|
from eval import langfuse_eval
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Result type ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class JourneyEvalResult:
    """Outcome of evaluating one journey fixture with one model."""

    fixture_name: str
    model: str
    prompt_template: str | None  # generated template; None if the journey failed
    conversation_turns: int
    done: bool  # True once the journey reached completion
    criteria_scores: dict[str, float]  # criterion text -> score between 0 and 1
    overall_score: float  # mean of the criteria scores
    judge_reasoning: str
    elapsed_seconds: float

    def summary(self) -> dict[str, Any]:
        """Return a compact dict of the result with rounded numbers.

        The template text and the judge reasoning are left out.
        """
        rounded_scores: dict[str, float] = {}
        for criterion, value in self.criteria_scores.items():
            rounded_scores[criterion] = round(value, 3)

        return dict(
            fixture=self.fixture_name,
            model=self.model,
            done=self.done,
            turns=self.conversation_turns,
            overall_score=round(self.overall_score, 3),
            criteria_scores=rounded_scores,
            elapsed_s=round(self.elapsed_seconds, 1),
        )
|
||||||
|
|
||||||
|
|
||||||
|
# ── LLM judge for template quality ──────────────────────────────────────
|
||||||
|
|
||||||
|
_JOURNEY_JUDGE_SYSTEM = """\
|
||||||
|
You are an evaluation judge for AI-generated prompt templates.
|
||||||
|
|
||||||
|
A journey chatbot explored a user's directory structure and through
|
||||||
|
conversation produced a prompt_template — an instruction set for a
|
||||||
|
data-extraction agent.
|
||||||
|
|
||||||
|
Your task: evaluate the generated template against a list of criteria.
|
||||||
|
Score each criterion from 0 to 1:
|
||||||
|
- 1.0: Fully satisfied, clearly present in the template
|
||||||
|
- 0.5: Partially satisfied or ambiguously addressed
|
||||||
|
- 0.0: Not satisfied, missing from the template
|
||||||
|
|
||||||
|
Respond with ONLY a JSON object:
|
||||||
|
{
|
||||||
|
"scores": {"criterion_1": 0.8, "criterion_2": 1.0, ...},
|
||||||
|
"reasoning": "Brief explanation"
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Sentinel for "no candidate key was present", distinct from a real 0.0 score.
_UNMATCHED = object()


def _strip_code_fence(raw: str) -> str:
    """Return *raw* without a leading Markdown code fence and optional ``json`` tag."""
    raw = raw.strip()
    if raw.startswith("```"):
        raw = raw.split("```")[1]
        if raw.startswith("json"):
            raw = raw[4:]
    return raw.strip()


def _scores_by_criterion(scores_raw: dict[str, Any], criteria: list[str]) -> dict[str, float]:
    """Map the judge's ``scores`` object back onto the original criterion texts."""
    positional = list(scores_raw.values())
    resolved: dict[str, float] = {}
    for i, criterion in enumerate(criteria):
        # Keys the judge is likely to use: the "criterion_<n>" form from the
        # system prompt, the criterion text (full or truncated), or the bare
        # 1-based index.
        candidates = (f"criterion_{i+1}", criterion, criterion[:50], str(i + 1))
        value = next((scores_raw[key] for key in candidates if key in scores_raw), _UNMATCHED)
        if value is _UNMATCHED:
            # Fall back to positional order only when no key matched. A score
            # of 0.0 found under a matching key is a real result and is kept.
            value = positional[i] if i < len(positional) else 0.0
        resolved[criterion] = float(value)
    return resolved


async def _judge_template(
    prompt_template: str,
    criteria: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
) -> tuple[dict[str, float], str]:
    """Use an LLM to evaluate a generated prompt_template against criteria.

    Args:
        prompt_template: The template text produced by the journey.
        criteria: Statements the template is scored against.
        judge_model: Model name passed to ``get_llm`` for the judge call.

    Returns:
        ``(criteria_scores, reasoning)``, where ``criteria_scores`` maps each
        criterion text to the judge's score. If the call or the parsing of
        its reply fails, every criterion scores 0.0 and ``reasoning`` carries
        the error text.
    """
    from app.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)

    criteria_text = "\n".join(f" {i+1}. {c}" for i, c in enumerate(criteria))
    user_content = (
        f"## Generated prompt_template\n```\n{prompt_template}\n```\n\n"
        f"## Criteria to evaluate\n{criteria_text}"
    )

    try:
        response = await llm.ainvoke([
            SystemMessage(content=_JOURNEY_JUDGE_SYSTEM),
            HumanMessage(content=user_content),
        ])
        parsed = json.loads(_strip_code_fence(response.content))

        criteria_scores = _scores_by_criterion(parsed.get("scores", {}), criteria)
        reasoning = str(parsed.get("reasoning", ""))
        return criteria_scores, reasoning
    except Exception as exc:
        # Best effort: a broken judge reply must not abort the eval run.
        logger.warning("journey_eval: LLM judge failed: %s", exc)
        return {c: 0.0 for c in criteria}, f"Judge error: {exc}"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Journey runner ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
async def run_single_journey_eval(
    fixture: JourneyFixture,
    model: str,
    *,
    judge_model: str = "gpt-4o-mini",
) -> JourneyEvalResult:
    """Execute one journey eval: start → messages → score template.

    Steps:
        1. Build a ``MockExecutor`` over the fixture's sample files and
           temporarily set ``settings.LLM_MODEL`` to *model*.
        2. Call ``handle_journey_start``, then feed ``fixture.user_messages``
           through ``handle_journey_message`` until a reply reports ``done``.
           If the scripted messages run out first, send one final nudge
           asking for the template.
        3. Score the resulting template with the LLM judge.
        4. Log a trace and scores through ``langfuse_eval``.

    Args:
        fixture: Journey scenario to run.
        model: Model name the journey runs with.
        judge_model: Model name used by the judge.

    Returns:
        A ``JourneyEvalResult``. Exceptions raised inside the conversation
        are logged and produce a result with whatever was collected so far;
        without a template every criterion scores 0.0.
    """
    from shared.config import settings

    # Build mock executor for filesystem tools
    mock = MockExecutor(
        fixture_dir=fixture.fixture_dir,
        seed_records={},
    )

    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model

    eval_user_id = f"eval-journey-{uuid.uuid4().hex[:8]}"

    logger.info(
        "journey_eval: starting %s | model=%s",
        fixture.name, model,
    )
    # perf_counter is monotonic, so the measured duration is unaffected by
    # system clock adjustments during a long run.
    start_time = time.perf_counter()

    prompt_template: str | None = None
    conversation: list[dict[str, str]] = []
    done = False

    try:
        from app.ws_context import set_current_user, clear_current_user
        from app.journey import handle_journey_start, handle_journey_message

        set_current_user(eval_user_id)
        with mock.patch():
            # ── Start the journey ────────────────────────────────
            start_frame: dict[str, Any] = {
                "agent_type": "local",
                "directory": fixture.directory,
                "data_types": fixture.data_types,
                "session_id": f"eval-{uuid.uuid4().hex[:8]}",
            }

            reply = await handle_journey_start(eval_user_id, start_frame)
            session_id = reply["session_id"]
            conversation.append({"role": "assistant", "content": reply["message"]})

            logger.info(
                "journey_eval: start reply (%d chars), done=%s",
                len(reply["message"]), reply["done"],
            )

            if reply["done"]:
                prompt_template = reply.get("prompt_template")
                done = True
            else:
                # ── Send user messages ───────────────────────────
                # The nudge is appended as the last scripted turn, so it is
                # only sent when no earlier reply reported completion.
                nudge = "Please generate the final prompt_template now. I'm satisfied with the configuration."
                scripted_turns = [*fixture.user_messages, nudge]

                for turn, user_msg in enumerate(scripted_turns, start=1):
                    conversation.append({"role": "user", "content": user_msg})

                    msg_frame: dict[str, Any] = {
                        "session_id": session_id,
                        "message": user_msg,
                    }
                    reply = await handle_journey_message(eval_user_id, msg_frame)
                    conversation.append({"role": "assistant", "content": reply["message"]})

                    logger.info(
                        "journey_eval: turn %d reply (%d chars), done=%s",
                        turn, len(reply["message"]), reply["done"],
                    )

                    if reply["done"]:
                        prompt_template = reply.get("prompt_template")
                        done = True
                        break

    except Exception:
        # logger.exception records the traceback, which is needed to debug a
        # failure raised somewhere inside the journey handlers.
        logger.exception("journey_eval: pipeline failed for %s/%s", fixture.name, model)
    finally:
        settings.LLM_MODEL = original_model
        # Imported again here so the name is bound even when the imports at
        # the top of the try block were never reached.
        from app.ws_context import clear_current_user
        clear_current_user()

    elapsed = time.perf_counter() - start_time
    turns = sum(1 for entry in conversation if entry["role"] == "user")

    logger.info(
        "journey_eval: completed in %.1fs — %d turns, done=%s, template=%s",
        elapsed, turns, done, "yes" if prompt_template else "no",
    )

    # ── Score the template ───────────────────────────────────────
    criteria_scores: dict[str, float] = {}
    judge_reasoning = ""

    if prompt_template and fixture.expected_template_criteria:
        criteria_scores, judge_reasoning = await _judge_template(
            prompt_template,
            fixture.expected_template_criteria,
            judge_model=judge_model,
        )
    elif not prompt_template:
        criteria_scores = {c: 0.0 for c in fixture.expected_template_criteria}
        if done:
            # Completion was reported, so "did not complete" would mislead.
            judge_reasoning = "No prompt_template was generated — journey reported completion without a template."
        else:
            judge_reasoning = "No prompt_template was generated — journey did not complete."

    overall = (
        sum(criteria_scores.values()) / len(criteria_scores)
        if criteria_scores
        else 0.0
    )

    result = JourneyEvalResult(
        fixture_name=fixture.name,
        model=model,
        prompt_template=prompt_template,
        conversation_turns=turns,
        done=done,
        criteria_scores=criteria_scores,
        overall_score=overall,
        judge_reasoning=judge_reasoning,
        elapsed_seconds=elapsed,
    )

    # ── Report to Langfuse ───────────────────────────────────────
    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant="journey",
        prompt_template=prompt_template or "(not generated)",
        # Only the first 20 conversation entries are attached to the trace.
        actual_mutations=[{"conversation": conversation[:20]}],
        scores_summary=result.summary(),
    )

    if trace_id:
        from eval.scorer import EvalScores

        # EvalScores is reused for journeys: precision, f1 and the judge
        # score all carry the overall score; recall carries the done flag.
        scores_obj = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant="journey",
            precision=overall,
            recall=float(done),
            f1=overall,
            llm_judge_score=overall,
            llm_judge_reasoning=judge_reasoning,
        )
        langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
async def run_journey_fixture_eval(
    fixture: JourneyFixture,
    models: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
) -> list[JourneyEvalResult]:
    """Evaluate *fixture* once per entry in *models*, one after another.

    The fixture is first synced to its Langfuse dataset. Results are returned
    in the same order as *models*.
    """
    langfuse_eval.sync_journey_fixture_to_dataset(fixture)

    # Awaiting inside the comprehension keeps the runs strictly sequential.
    return [
        await run_single_journey_eval(fixture, model_name, judge_model=judge_model)
        for model_name in models
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def print_journey_results(results: list[JourneyEvalResult]) -> None:
    """Write a results table and a per-result criteria breakdown to stdout."""
    if not results:
        print("\nNo journey eval results.")
        return

    heavy_rule = "=" * 95

    # Summary table: one row per result.
    print("\n" + heavy_rule)
    print(f"{'Fixture':<25} {'Model':<25} {'Done':>5} {'Turns':>6} {'Score':>7} {'Time':>7}")
    print("-" * 95)
    for r in results:
        if r.done:
            done_str = "yes"
        else:
            done_str = "NO"
        print(
            f"{r.fixture_name:<25} {r.model:<25} {done_str:>5} "
            f"{r.conversation_turns:>6} {r.overall_score:>7.2f} {r.elapsed_seconds:>6.1f}s"
        )
    print(heavy_rule)

    # Criteria breakdown
    for r in results:
        if r.criteria_scores:
            print(f"\n[{r.model}] Criteria scores:")
            for criterion, score in r.criteria_scores.items():
                if score >= 0.7:
                    indicator = "PASS"
                elif score >= 0.4:
                    indicator = "PARTIAL"
                else:
                    indicator = "FAIL"
                print(f" {indicator:>7} ({score:.1f}) {criterion}")

        if r.judge_reasoning:
            print(f" Judge: {r.judge_reasoning}")

        if r.prompt_template:
            # First 200 characters, flattened onto a single line.
            preview = r.prompt_template[:200].replace("\n", " ")
            print(f" Template preview: {preview}...")

    print()
|
||||||
@@ -96,6 +96,52 @@ def sync_fixture_to_dataset(fixture: EvalFixture) -> str | None:
|
|||||||
return dataset_name
|
return dataset_name
|
||||||
|
|
||||||
|
|
||||||
|
def sync_journey_fixture_to_dataset(fixture) -> str | None:
    """Create or update a Langfuse dataset from a journey fixture.

    Each journey fixture becomes a single dataset item with:
    - input: {directory, data_types, user_messages}
    - expected_output: {criteria}

    Args:
        fixture: Journey fixture; its ``name``, ``description``,
            ``directory``, ``data_types``, ``user_messages`` and
            ``expected_template_criteria`` attributes are read.

    Returns:
        The dataset name (``journey-eval-<fixture name>``), or ``None`` when
        Langfuse is not configured. Failures while creating the dataset or
        the item are logged and do not raise.
    """
    lf = _get_langfuse()
    if lf is None:
        logger.info("langfuse_eval: Langfuse not configured — skipping journey dataset sync")
        return None

    dataset_name = f"journey-eval-{fixture.name}"

    try:
        lf.create_dataset(
            name=dataset_name,
            description=fixture.description,
            metadata={"type": "journey", "data_types": fixture.data_types},
        )
    except Exception as exc:
        # Best effort: the dataset may already exist. The error is still
        # logged so that other causes are not hidden entirely.
        logger.debug(
            "langfuse_eval: create_dataset failed for '%s' (may already exist): %s",
            dataset_name, exc,
        )

    item_id = f"{fixture.name}--journey"
    item_synced = True
    try:
        lf.create_dataset_item(
            dataset_name=dataset_name,
            id=item_id,
            input={
                "directory": fixture.directory,
                "data_types": fixture.data_types,
                "user_messages": fixture.user_messages,
            },
            expected_output={
                "criteria": fixture.expected_template_criteria,
            },
            metadata={"type": "journey"},
        )
    except Exception as exc:
        item_synced = False
        logger.warning("langfuse_eval: failed to upsert journey dataset item %s: %s", item_id, exc)

    lf.flush()
    # Report success only when the item call did not raise, so the log does
    # not contradict the warning above.
    if item_synced:
        logger.info("langfuse_eval: synced journey fixture '%s' → dataset '%s'", fixture.name, dataset_name)
    return dataset_name
|
||||||
|
|
||||||
|
|
||||||
def create_eval_run(
|
def create_eval_run(
|
||||||
dataset_name: str,
|
dataset_name: str,
|
||||||
run_name: str,
|
run_name: str,
|
||||||
|
|||||||
Reference in New Issue
Block a user