- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants - Rewrite runner with _run_step1, _run_step2, _run_full dispatch - CLI: replace --variants with --mode flag - Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full) - Remove old freelance_invoices fixture - Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full) - Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode - Langfuse: post separate classification_precision/recall/f1 scores for full mode - Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1) - Langfuse: include step1_results in trace output - MockExecutor: mock async_session to bypass DB in full mode - Journey fixture: remove user_messages (only interactive test kept)
472 lines
17 KiB
Python
472 lines
17 KiB
Python
"""Interactive journey session — human-in-the-loop CLI conversation.
|
|
|
|
Flow:
|
|
1. Show the system prompt used by the journey AI.
|
|
2. Start the journey (AI explores files, asks first question).
|
|
3. User types responses in the terminal — AI replies.
|
|
4. User types `/done` to end the conversation.
|
|
5. User writes a comment about the interaction quality.
|
|
6. LLM judge scores the conversation + generated template.
|
|
7. Results are reported to Langfuse.
|
|
|
|
Usage::
|
|
|
|
python -m eval interactive # pick a fixture interactively
|
|
python -m eval interactive --fixture=journey-invoice-setup
|
|
python -m eval interactive --model=gpt-4o
|
|
python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
|
|
"""
|
|
|
|
from __future__ import annotations

import asyncio
import json
import logging
import sys
import time
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from langchain_core.messages import HumanMessage, SystemMessage

from eval.config import JourneyFixture, discover_journey_fixtures
from eval.mock_executor import MockExecutor
from eval import langfuse_eval
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Special commands ─────────────────────────────────────────────────────
|
|
|
|
# Slash-commands recognised at the interactive prompt. The conversation loop
# compares them against the lower-cased user input, so they must stay lowercase.
_CMD_DONE = "/done"
_CMD_QUIT = "/quit"
_CMD_TEMPLATE = "/template"
_CMD_HELP = "/help"

# Text printed in response to the help command; one line per command above.
_HELP_TEXT = f"""\
{_CMD_DONE} — End the conversation and proceed to evaluation
{_CMD_QUIT} — Abort without evaluation
{_CMD_TEMPLATE} — Show the generated template (if any)
{_CMD_HELP} — Show this help"""
|
|
|
|
# ── Terminal colours (ANSI) ──────────────────────────────────────────────
|
|
|
|
# ANSI SGR escape sequences used by the _print_* helpers.
# _C_RESET clears every attribute set by the other codes.
_C_RESET = "\033[0m"
# Text attributes.
_C_BOLD = "\033[1m"
_C_DIM = "\033[2m"
# Foreground colours.
_C_CYAN = "\033[36m"
_C_GREEN = "\033[32m"
_C_YELLOW = "\033[33m"
_C_MAGENTA = "\033[35m"
_C_RED = "\033[31m"
_C_BLUE = "\033[34m"
|
|
|
|
|
|
def _print_header(text: str) -> None:
    """Print *text* as a bold cyan banner framed by two 80-character rules."""
    rule = "═" * 80
    print(f"\n{_C_BOLD}{_C_CYAN}{rule}")
    print(f" {text}")
    print(f"{rule}{_C_RESET}\n")
|
|
|
|
|
|
def _print_ai(text: str) -> None:
    """Print an assistant message prefixed with a bold green ``AI:`` label."""
    label = f"{_C_GREEN}{_C_BOLD}AI:{_C_RESET}"
    print(f"\n{label} {text}\n")
|
|
|
|
|
|
def _print_system(text: str) -> None:
    """Print a status/notice line in dim text."""
    dimmed = f"{_C_DIM}{text}{_C_RESET}"
    print(dimmed)
|
|
|
|
|
|
def _print_score(label: str, score: float) -> None:
    """Print one score line tagged PASS (>= 0.7), PARTIAL (>= 0.4) or FAIL."""
    # Bands are checked from the highest threshold down; anything that clears
    # none of them keeps the FAIL defaults.
    bands = (
        (0.7, _C_GREEN, "PASS"),
        (0.4, _C_YELLOW, "PARTIAL"),
    )
    color, tag = _C_RED, "FAIL"
    for threshold, band_color, band_tag in bands:
        if score >= threshold:
            color, tag = band_color, band_tag
            break
    print(f" {color}{tag:>7}{_C_RESET} ({score:.1f}) {label}")
|
|
|
|
|
|
# ── Result type ──────────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class InteractiveResult:
    """Outcome of one interactive journey session plus its judge scores."""

    fixture_name: str
    model: str
    judge_model: str
    # Template produced by the journey, or None when none was generated.
    prompt_template: str | None
    # Transcript entries; each is a dict with "role" and "content" keys.
    conversation: list[dict[str, str]]
    user_comment: str
    # True when the journey reported completion.
    done: bool
    criteria_scores: dict[str, float]
    overall_score: float
    judge_reasoning: str
    elapsed_seconds: float

    def summary(self) -> dict[str, Any]:
        """Return a compact dict of the result with rounded scores.

        ``turns`` counts only the transcript entries whose role is ``"user"``.
        """
        user_turns = sum(1 for turn in self.conversation if turn["role"] == "user")
        rounded_criteria = {
            name: round(value, 3) for name, value in self.criteria_scores.items()
        }
        return {
            "fixture": self.fixture_name,
            "model": self.model,
            "judge_model": self.judge_model,
            "done": self.done,
            "turns": user_turns,
            "overall_score": round(self.overall_score, 3),
            "user_comment": self.user_comment,
            "criteria_scores": rounded_criteria,
            "elapsed_s": round(self.elapsed_seconds, 1),
        }
|
|
|
|
|
|
# ── LLM judge ────────────────────────────────────────────────────────────
|
|
|
|
_INTERACTIVE_JUDGE_SYSTEM = """\
|
|
You are an evaluation judge for AI-generated prompt templates produced during
|
|
an interactive conversation between a human and a journey chatbot.
|
|
|
|
The chatbot explored a directory and through multi-turn conversation with the
|
|
user produced a prompt_template — an instruction set for a data-extraction agent.
|
|
|
|
You have access to:
|
|
- The full conversation transcript
|
|
- The generated prompt_template (if any)
|
|
- The user's own comment about the interaction
|
|
- A list of quality criteria
|
|
|
|
Score each criterion from 0 to 1:
|
|
- 1.0: Fully satisfied
|
|
- 0.5: Partially satisfied
|
|
- 0.0: Not satisfied
|
|
|
|
Also provide an overall_quality score (0-1) evaluating the conversation flow,
|
|
how well the AI understood the user, and the template quality.
|
|
|
|
Respond with ONLY a JSON object:
|
|
{
|
|
"criteria_scores": {"criterion_1": 0.8, ...},
|
|
"overall_quality": 0.85,
|
|
"reasoning": "Brief explanation covering both conversation quality and template accuracy"
|
|
}
|
|
"""
|
|
|
|
|
|
def _match_criterion_scores(scores_raw: Any, criteria: list[str]) -> dict[str, float]:
    """Map the judge's raw score payload onto *criteria*, preserving their order.

    For each criterion the payload is searched by key (``criterion_<n>``, the
    full criterion text, its first 50 characters, then ``"<n>"``). Only when no
    key matches does the n-th value of the payload serve as a positional
    fallback; a criterion with neither gets 0.0.
    """
    if isinstance(scores_raw, dict):
        keyed = scores_raw
        positional = list(scores_raw.values())
    elif isinstance(scores_raw, list):
        # Some models answer with a bare list of scores instead of an object.
        keyed = {}
        positional = scores_raw
    else:
        keyed, positional = {}, []

    matched: dict[str, float] = {}
    for index, criterion in enumerate(criteria):
        candidates = (
            f"criterion_{index + 1}",
            criterion,
            criterion[:50],
            str(index + 1),
        )
        hit = next((key for key in candidates if key in keyed), None)
        if hit is not None:
            # An explicit match wins even when its score is 0.0; falling back
            # to position here could pick up another criterion's score.
            matched[criterion] = float(keyed[hit])
        elif index < len(positional):
            matched[criterion] = float(positional[index])
        else:
            matched[criterion] = 0.0
    return matched


async def _judge_interactive(
    conversation: list[dict[str, str]],
    prompt_template: str | None,
    user_comment: str,
    criteria: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
) -> tuple[dict[str, float], float, str]:
    """Score an interactive session with an LLM judge.

    Parameters
    ----------
    conversation :
        Transcript entries with ``"role"`` and ``"content"`` keys; any role
        other than ``"user"`` is rendered as ``AI`` in the prompt.
    prompt_template :
        The generated template, or ``None`` if the journey did not produce one.
    user_comment :
        The human's free-text comment about the session.
    criteria :
        Quality criteria the judge scores individually.
    judge_model :
        Model name passed to ``get_llm``.

    Returns
    -------
    ``(criteria_scores, overall_quality, reasoning)``. If the judge call or
    the parsing of its reply fails, every criterion is scored 0.0, the overall
    quality is 0.0 and the reasoning carries the error message.
    """
    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)

    conv_text = "\n".join(
        f"{'USER' if t['role'] == 'user' else 'AI'}: {t['content']}"
        for t in conversation
    )
    criteria_text = "\n".join(f" {i+1}. {c}" for i, c in enumerate(criteria))

    user_content = (
        f"## Conversation transcript\n```\n{conv_text}\n```\n\n"
        f"## Generated prompt_template\n```\n{prompt_template or '(none — conversation did not complete)'}\n```\n\n"
        f"## User's comment\n{user_comment}\n\n"
        f"## Criteria to evaluate\n{criteria_text}"
    )

    try:
        response = await llm.ainvoke([
            SystemMessage(content=_INTERACTIVE_JUDGE_SYSTEM),
            HumanMessage(content=user_content),
        ])
        raw = response.content.strip()
        # Unwrap a fenced reply (``` or ```json) to get at the bare JSON.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw.strip())

        scores_raw = parsed.get("criteria_scores", parsed.get("scores", {}))
        criteria_scores = _match_criterion_scores(scores_raw, criteria)

        overall = float(parsed.get("overall_quality", 0.0))
        reasoning = str(parsed.get("reasoning", ""))
        return criteria_scores, overall, reasoning

    except Exception as exc:
        # Deliberate best-effort boundary: a failed or malformed judge reply
        # must not abort the session, so it is reported as an all-zero result.
        logger.warning("interactive judge failed: %s", exc)
        return {c: 0.0 for c in criteria}, 0.0, f"Judge error: {exc}"
|
|
|
|
|
|
# ── Interactive session ──────────────────────────────────────────────────
|
|
|
|
|
|
def _read_user_comment() -> str:
    """Read a multi-line comment from stdin.

    Input ends at two consecutive empty lines, or at EOF / Ctrl-C. Returns the
    stripped text, or ``"(no comment)"`` when nothing was entered.
    """
    comment_lines: list[str] = []
    try:
        while True:
            line = input()
            if line == "" and comment_lines and comment_lines[-1] == "":
                comment_lines.pop()  # remove trailing empty
                break
            comment_lines.append(line)
    except (EOFError, KeyboardInterrupt):
        pass
    return "\n".join(comment_lines).strip() or "(no comment)"


async def run_interactive(
    fixture: JourneyFixture,
    *,
    model: str = "gpt-4o",
    judge_model: str = "gpt-4o-mini",
    data_dir: Path | None = None,
) -> InteractiveResult:
    """Run an interactive journey session in the terminal.

    Shows the system prompt, drives the conversation from stdin, collects a
    user comment, scores the session with the LLM judge and reports the
    result to Langfuse.

    Parameters
    ----------
    fixture :
        Journey fixture supplying the directory, data types and the
        expected template criteria.
    model :
        Value assigned to ``settings.LLM_MODEL`` for the duration of the
        conversation; the previous value is restored afterwards.
    judge_model :
        Model used by the LLM judge.
    data_dir :
        If set, overrides the fixture's sample-file directory. The LLM
        will explore this folder instead of the default
        ``fixtures/sample_files/…``. Useful for private test data that
        shouldn't be committed to git.

    Returns
    -------
    The session result. When the user aborts with the quit command (or
    EOF / Ctrl-C at the prompt) the result is returned immediately, with no
    judge evaluation and no Langfuse report.
    """
    from shared.config import settings
    from shared.ws_context import set_current_user, clear_current_user
    from app.journey import (
        handle_journey_start,
        handle_journey_message,
        _build_system_prompt,
    )

    # When --data-dir is given, the MockExecutor's root becomes
    # data_dir's parent and the journey directory is data_dir's name.
    # This way the LLM sees a meaningful directory name (not ".") and
    # MockExecutor resolves paths correctly.
    # Otherwise, use the fixture's YAML parent and its relative path.
    if data_dir:
        mock_root = data_dir.parent
        journey_directory = data_dir.name
    else:
        mock_root = fixture.fixture_path.parent
        journey_directory = fixture.directory

    mock = MockExecutor(
        fixture_dir=mock_root,
        seed_records={},
    )

    eval_user_id = f"interactive-{uuid.uuid4().hex[:8]}"
    conversation: list[dict[str, str]] = []
    prompt_template: str | None = None
    done = False

    # The model override mutates global settings, so everything that runs
    # under it sits inside try/finally — including the prompt build, which
    # would otherwise leave the override in place if it raised.
    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    try:
        # ── Show system prompt ───────────────────────────────────
        system_prompt = _build_system_prompt(journey_directory, fixture.data_types)

        _print_header("SYSTEM PROMPT")
        print(f"{_C_DIM}{system_prompt}{_C_RESET}")

        _print_header(f"INTERACTIVE JOURNEY | fixture: {fixture.name} | model: {model}")
        print(f" Data dir: {mock_root}")
        print(f" Type your responses. Commands: {_CMD_DONE}, {_CMD_QUIT}, {_CMD_TEMPLATE}, {_CMD_HELP}")
        print(f" Judge model: {judge_model}")
        print(f" Criteria: {len(fixture.expected_template_criteria)}")
        print()

        start_time = time.time()

        try:
            set_current_user(eval_user_id)

            with mock.patch():
                # ── Start ────────────────────────────────────────
                _print_system("Starting journey... (AI is exploring your files)")

                start_frame: dict[str, Any] = {
                    "agent_type": "local",
                    "directory": journey_directory,
                    "data_types": fixture.data_types,
                    "session_id": f"interactive-{uuid.uuid4().hex[:8]}",
                }

                reply = await handle_journey_start(eval_user_id, start_frame)
                # Follow-up messages use the session id from the reply.
                session_id = reply["session_id"]
                conversation.append({"role": "assistant", "content": reply["message"]})
                _print_ai(reply["message"])

                if reply["done"]:
                    prompt_template = reply.get("prompt_template")
                    done = True
                    _print_system("Journey completed on first reply (template generated).")

                # ── Conversation loop ────────────────────────────
                while not done:
                    try:
                        user_input = input(f"{_C_BOLD}{_C_BLUE}YOU:{_C_RESET} ").strip()
                    except (EOFError, KeyboardInterrupt):
                        # Closed stdin or Ctrl-C at the prompt counts as quit.
                        print()
                        user_input = _CMD_QUIT

                    if not user_input:
                        continue

                    command = user_input.lower()

                    if command == _CMD_QUIT:
                        _print_system("Aborted — no evaluation will be performed.")
                        # The enclosing finally blocks clear the user and
                        # restore the model on the way out.
                        return InteractiveResult(
                            fixture_name=fixture.name,
                            model=model,
                            judge_model=judge_model,
                            prompt_template=None,
                            conversation=conversation,
                            user_comment="(aborted)",
                            done=False,
                            criteria_scores={},
                            overall_score=0.0,
                            judge_reasoning="Session aborted by user.",
                            elapsed_seconds=time.time() - start_time,
                        )

                    if command == _CMD_HELP:
                        print(_HELP_TEXT)
                        continue

                    if command == _CMD_TEMPLATE:
                        if prompt_template:
                            print(f"\n{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
                        else:
                            _print_system("No template generated yet.")
                        continue

                    if command == _CMD_DONE:
                        _print_system("Ending conversation...")
                        break

                    # ── Send message to AI ───────────────────────
                    conversation.append({"role": "user", "content": user_input})
                    _print_system("AI is thinking...")

                    msg_frame: dict[str, Any] = {
                        "session_id": session_id,
                        "message": user_input,
                    }
                    reply = await handle_journey_message(eval_user_id, msg_frame)
                    conversation.append({"role": "assistant", "content": reply["message"]})
                    _print_ai(reply["message"])

                    if reply["done"]:
                        prompt_template = reply.get("prompt_template")
                        done = True
                        _print_system("Journey completed — template generated!")

        except Exception as exc:
            # Best-effort: a failed journey is still commented on and judged
            # with whatever transcript exists. Log with traceback.
            logger.exception("interactive journey failed: %s", exc)
            _print_system(f"Error: {exc}")
        finally:
            clear_current_user()
    finally:
        settings.LLM_MODEL = original_model

    elapsed = time.time() - start_time
    turns = sum(1 for entry in conversation if entry["role"] == "user")

    # ── Show template if generated ───────────────────────────────
    if prompt_template:
        _print_header("GENERATED TEMPLATE")
        print(f"{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
    else:
        _print_system("No template was generated during this session.")

    # ── User comment ─────────────────────────────────────────────
    _print_header("YOUR EVALUATION")
    print(" Write your comment about this interaction (press Enter twice to finish):")
    print()
    user_comment = _read_user_comment()

    # ── Judge ────────────────────────────────────────────────────
    _print_header("LLM JUDGE EVALUATION")
    _print_system(f"Scoring with {judge_model}...")

    criteria_scores, overall_quality, judge_reasoning = await _judge_interactive(
        conversation=conversation,
        prompt_template=prompt_template,
        user_comment=user_comment,
        criteria=fixture.expected_template_criteria,
        judge_model=judge_model,
    )

    # ── Display scores ───────────────────────────────────────────
    print()
    for criterion, score in criteria_scores.items():
        _print_score(criterion, score)

    # Mean of the per-criterion scores; distinct from the judge's own
    # overall_quality figure.
    overall = (
        sum(criteria_scores.values()) / len(criteria_scores)
        if criteria_scores
        else 0.0
    )

    print(f"\n {_C_BOLD}Criteria avg: {overall:.2f}{_C_RESET}")
    print(f" {_C_BOLD}Overall quality: {overall_quality:.2f}{_C_RESET}")
    print(f" {_C_BOLD}Turns: {turns}{_C_RESET}")
    print(f" {_C_BOLD}Time: {elapsed:.1f}s{_C_RESET}")
    print(f"\n {_C_DIM}Judge: {judge_reasoning}{_C_RESET}")
    print(f" {_C_DIM}Your comment: {user_comment}{_C_RESET}\n")

    result = InteractiveResult(
        fixture_name=fixture.name,
        model=model,
        judge_model=judge_model,
        prompt_template=prompt_template,
        conversation=conversation,
        user_comment=user_comment,
        done=done,
        criteria_scores=criteria_scores,
        overall_score=overall_quality,
        judge_reasoning=judge_reasoning,
        elapsed_seconds=elapsed,
    )

    # ── Report to Langfuse ───────────────────────────────────────
    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant="interactive",
        prompt_template=prompt_template or "(not generated)",
        actual_mutations=[{
            # Only the first 30 transcript entries are attached to the trace.
            "conversation": conversation[:30],
            "user_comment": user_comment,
        }],
        scores_summary=result.summary(),
        langfuse_prompt_names=["journey_system"],
    )

    if trace_id:
        from eval.scorer import EvalScores

        # EvalScores' precision/f1 carry the criteria average and recall
        # carries the completion flag for interactive runs.
        scores_obj = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant="interactive",
            precision=overall,
            recall=float(done),
            f1=overall,
            llm_judge_score=overall_quality,
            llm_judge_reasoning=judge_reasoning,
        )
        langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id)
        _print_system(f"Results reported to Langfuse (trace: {trace_id})")
    else:
        _print_system("Langfuse not configured — results not reported.")

    return result
|