Files
api/services/batch-agent/eval/interactive.py
Roberto Musso d3f7099d93 refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes
- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00

472 lines
17 KiB
Python

"""Interactive journey session — human-in-the-loop CLI conversation.
Flow:
1. Show the system prompt used by the journey AI.
2. Start the journey (AI explores files, asks first question).
3. User types responses in the terminal — AI replies.
4. User types `/done` to end the conversation.
5. User writes a comment about the interaction quality.
6. LLM judge scores the conversation + generated template.
7. Results are reported to Langfuse.
Usage::
python -m eval interactive # pick a fixture interactively
python -m eval interactive --fixture=journey-invoice-setup
python -m eval interactive --model=gpt-4o
python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
"""
from __future__ import annotations
import asyncio
import json
import logging
import sys
import time
import uuid
from dataclasses import dataclass, field
from typing import Any
from langchain_core.messages import HumanMessage, SystemMessage
from eval.config import JourneyFixture, discover_journey_fixtures
from eval.mock_executor import MockExecutor
from eval import langfuse_eval
logger = logging.getLogger(__name__)
# ── Special commands ─────────────────────────────────────────────────────
_CMD_DONE = "/done"
_CMD_QUIT = "/quit"
_CMD_TEMPLATE = "/template"
_CMD_HELP = "/help"
_HELP_TEXT = f"""\
{_CMD_DONE} — End the conversation and proceed to evaluation
{_CMD_QUIT} — Abort without evaluation
{_CMD_TEMPLATE} — Show the generated template (if any)
{_CMD_HELP} — Show this help"""
# ── Terminal colours (ANSI) ──────────────────────────────────────────────
_C_RESET = "\033[0m"
_C_BOLD = "\033[1m"
_C_DIM = "\033[2m"
_C_CYAN = "\033[36m"
_C_GREEN = "\033[32m"
_C_YELLOW = "\033[33m"
_C_MAGENTA = "\033[35m"
_C_RED = "\033[31m"
_C_BLUE = "\033[34m"
def _print_header(text: str) -> None:
print(f"\n{_C_BOLD}{_C_CYAN}{'' * 80}")
print(f" {text}")
print(f"{'' * 80}{_C_RESET}\n")
def _print_ai(text: str) -> None:
print(f"\n{_C_GREEN}{_C_BOLD}AI:{_C_RESET} {text}\n")
def _print_system(text: str) -> None:
print(f"{_C_DIM}{text}{_C_RESET}")
def _print_score(label: str, score: float) -> None:
if score >= 0.7:
color = _C_GREEN
tag = "PASS"
elif score >= 0.4:
color = _C_YELLOW
tag = "PARTIAL"
else:
color = _C_RED
tag = "FAIL"
print(f" {color}{tag:>7}{_C_RESET} ({score:.1f}) {label}")
# ── Result type ──────────────────────────────────────────────────────────
@dataclass
class InteractiveResult:
fixture_name: str
model: str
judge_model: str
prompt_template: str | None
conversation: list[dict[str, str]]
user_comment: str
done: bool
criteria_scores: dict[str, float]
overall_score: float
judge_reasoning: str
elapsed_seconds: float
def summary(self) -> dict[str, Any]:
return {
"fixture": self.fixture_name,
"model": self.model,
"judge_model": self.judge_model,
"done": self.done,
"turns": len([c for c in self.conversation if c["role"] == "user"]),
"overall_score": round(self.overall_score, 3),
"user_comment": self.user_comment,
"criteria_scores": {k: round(v, 3) for k, v in self.criteria_scores.items()},
"elapsed_s": round(self.elapsed_seconds, 1),
}
# ── LLM judge ────────────────────────────────────────────────────────────
_INTERACTIVE_JUDGE_SYSTEM = """\
You are an evaluation judge for AI-generated prompt templates produced during
an interactive conversation between a human and a journey chatbot.
The chatbot explored a directory and through multi-turn conversation with the
user produced a prompt_template — an instruction set for a data-extraction agent.
You have access to:
- The full conversation transcript
- The generated prompt_template (if any)
- The user's own comment about the interaction
- A list of quality criteria
Score each criterion from 0 to 1:
- 1.0: Fully satisfied
- 0.5: Partially satisfied
- 0.0: Not satisfied
Also provide an overall_quality score (0-1) evaluating the conversation flow,
how well the AI understood the user, and the template quality.
Respond with ONLY a JSON object:
{
"criteria_scores": {"criterion_1": 0.8, ...},
"overall_quality": 0.85,
"reasoning": "Brief explanation covering both conversation quality and template accuracy"
}
"""
async def _judge_interactive(
conversation: list[dict[str, str]],
prompt_template: str | None,
user_comment: str,
criteria: list[str],
*,
judge_model: str = "gpt-4o-mini",
) -> tuple[dict[str, float], float, str]:
"""Score an interactive session. Returns (criteria_scores, overall_quality, reasoning)."""
from shared.llm import get_llm
llm = get_llm(model=judge_model, temperature=0)
conv_text = "\n".join(
f"{'USER' if t['role'] == 'user' else 'AI'}: {t['content']}"
for t in conversation
)
criteria_text = "\n".join(f" {i+1}. {c}" for i, c in enumerate(criteria))
user_content = (
f"## Conversation transcript\n```\n{conv_text}\n```\n\n"
f"## Generated prompt_template\n```\n{prompt_template or '(none — conversation did not complete)'}\n```\n\n"
f"## User's comment\n{user_comment}\n\n"
f"## Criteria to evaluate\n{criteria_text}"
)
try:
response = await llm.ainvoke([
SystemMessage(content=_INTERACTIVE_JUDGE_SYSTEM),
HumanMessage(content=user_content),
])
raw = response.content.strip()
if raw.startswith("```"):
raw = raw.split("```")[1]
if raw.startswith("json"):
raw = raw[4:]
parsed = json.loads(raw.strip())
scores_raw = parsed.get("criteria_scores", parsed.get("scores", {}))
criteria_scores: dict[str, float] = {}
for i, criterion in enumerate(criteria):
key_candidates = [f"criterion_{i+1}", criterion, criterion[:50], str(i + 1)]
score = 0.0
for key in key_candidates:
if key in scores_raw:
score = float(scores_raw[key])
break
if score == 0.0 and i < len(scores_raw):
score = float(list(scores_raw.values())[i])
criteria_scores[criterion] = score
overall = float(parsed.get("overall_quality", 0.0))
reasoning = str(parsed.get("reasoning", ""))
return criteria_scores, overall, reasoning
except Exception as exc:
logger.warning("interactive judge failed: %s", exc)
return {c: 0.0 for c in criteria}, 0.0, f"Judge error: {exc}"
# ── Interactive session ──────────────────────────────────────────────────
async def run_interactive(
fixture: JourneyFixture,
*,
model: str = "gpt-4o",
judge_model: str = "gpt-4o-mini",
data_dir: Path | None = None,
) -> InteractiveResult:
"""Run an interactive journey session in the terminal.
Parameters
----------
data_dir :
If set, overrides the fixture's sample-file directory. The LLM
will explore this folder instead of the default
``fixtures/sample_files/…``. Useful for private test data that
shouldn't be committed to git.
"""
from shared.config import settings
from shared.ws_context import set_current_user, clear_current_user
from app.journey import (
handle_journey_start,
handle_journey_message,
_build_system_prompt,
)
# When --data-dir is given, the MockExecutor's root becomes
# data_dir's parent and the journey directory is data_dir's name.
# This way the LLM sees a meaningful directory name (not ".") and
# MockExecutor resolves paths correctly.
# Otherwise, use the fixture's YAML parent and its relative path.
if data_dir:
mock_root = data_dir.parent
journey_directory = data_dir.name
else:
mock_root = fixture.fixture_path.parent
journey_directory = fixture.directory
mock = MockExecutor(
fixture_dir=mock_root,
seed_records={},
)
original_model = settings.LLM_MODEL
settings.LLM_MODEL = model
eval_user_id = f"interactive-{uuid.uuid4().hex[:8]}"
# ── Show system prompt ───────────────────────────────────────
system_prompt = _build_system_prompt(journey_directory, fixture.data_types)
_print_header("SYSTEM PROMPT")
print(f"{_C_DIM}{system_prompt}{_C_RESET}")
_print_header(f"INTERACTIVE JOURNEY | fixture: {fixture.name} | model: {model}")
print(f" Data dir: {mock_root}")
print(f" Type your responses. Commands: {_CMD_DONE}, {_CMD_QUIT}, {_CMD_TEMPLATE}, {_CMD_HELP}")
print(f" Judge model: {judge_model}")
print(f" Criteria: {len(fixture.expected_template_criteria)}")
print()
conversation: list[dict[str, str]] = []
prompt_template: str | None = None
done = False
start_time = time.time()
try:
set_current_user(eval_user_id)
with mock.patch():
# ── Start ────────────────────────────────────────────
_print_system("Starting journey... (AI is exploring your files)")
start_frame: dict[str, Any] = {
"agent_type": "local",
"directory": journey_directory,
"data_types": fixture.data_types,
"session_id": f"interactive-{uuid.uuid4().hex[:8]}",
}
reply = await handle_journey_start(eval_user_id, start_frame)
session_id = reply["session_id"]
conversation.append({"role": "assistant", "content": reply["message"]})
_print_ai(reply["message"])
if reply["done"]:
prompt_template = reply.get("prompt_template")
done = True
_print_system("Journey completed on first reply (template generated).")
# ── Conversation loop ────────────────────────────────
while not done:
try:
user_input = input(f"{_C_BOLD}{_C_BLUE}YOU:{_C_RESET} ").strip()
except (EOFError, KeyboardInterrupt):
print()
user_input = _CMD_QUIT
if not user_input:
continue
# Handle commands
if user_input.lower() == _CMD_QUIT:
_print_system("Aborted — no evaluation will be performed.")
settings.LLM_MODEL = original_model
clear_current_user()
return InteractiveResult(
fixture_name=fixture.name, model=model, judge_model=judge_model,
prompt_template=None, conversation=conversation,
user_comment="(aborted)", done=False,
criteria_scores={}, overall_score=0.0,
judge_reasoning="Session aborted by user.",
elapsed_seconds=time.time() - start_time,
)
if user_input.lower() == _CMD_HELP:
print(_HELP_TEXT)
continue
if user_input.lower() == _CMD_TEMPLATE:
if prompt_template:
print(f"\n{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
else:
_print_system("No template generated yet.")
continue
if user_input.lower() == _CMD_DONE:
_print_system("Ending conversation...")
break
# ── Send message to AI ───────────────────────────
conversation.append({"role": "user", "content": user_input})
_print_system("AI is thinking...")
msg_frame: dict[str, Any] = {
"session_id": session_id,
"message": user_input,
}
reply = await handle_journey_message(eval_user_id, msg_frame)
conversation.append({"role": "assistant", "content": reply["message"]})
_print_ai(reply["message"])
if reply["done"]:
prompt_template = reply.get("prompt_template")
done = True
_print_system("Journey completed — template generated!")
except Exception as exc:
logger.error("interactive journey failed: %s", exc)
_print_system(f"Error: {exc}")
finally:
settings.LLM_MODEL = original_model
clear_current_user()
elapsed = time.time() - start_time
turns = len([c for c in conversation if c["role"] == "user"])
# ── Show template if generated ───────────────────────────────
if prompt_template:
_print_header("GENERATED TEMPLATE")
print(f"{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
else:
_print_system("No template was generated during this session.")
# ── User comment ─────────────────────────────────────────────
_print_header("YOUR EVALUATION")
print(" Write your comment about this interaction (press Enter twice to finish):")
print()
comment_lines: list[str] = []
try:
while True:
line = input()
if line == "" and comment_lines and comment_lines[-1] == "":
comment_lines.pop() # remove trailing empty
break
comment_lines.append(line)
except (EOFError, KeyboardInterrupt):
pass
user_comment = "\n".join(comment_lines).strip() or "(no comment)"
# ── Judge ────────────────────────────────────────────────────
_print_header("LLM JUDGE EVALUATION")
_print_system(f"Scoring with {judge_model}...")
criteria_scores, overall_quality, judge_reasoning = await _judge_interactive(
conversation=conversation,
prompt_template=prompt_template,
user_comment=user_comment,
criteria=fixture.expected_template_criteria,
judge_model=judge_model,
)
# ── Display scores ───────────────────────────────────────────
print()
for criterion, score in criteria_scores.items():
_print_score(criterion, score)
overall = (
sum(criteria_scores.values()) / len(criteria_scores)
if criteria_scores
else 0.0
)
print(f"\n {_C_BOLD}Criteria avg: {overall:.2f}{_C_RESET}")
print(f" {_C_BOLD}Overall quality: {overall_quality:.2f}{_C_RESET}")
print(f" {_C_BOLD}Turns: {turns}{_C_RESET}")
print(f" {_C_BOLD}Time: {elapsed:.1f}s{_C_RESET}")
print(f"\n {_C_DIM}Judge: {judge_reasoning}{_C_RESET}")
print(f" {_C_DIM}Your comment: {user_comment}{_C_RESET}\n")
result = InteractiveResult(
fixture_name=fixture.name,
model=model,
judge_model=judge_model,
prompt_template=prompt_template,
conversation=conversation,
user_comment=user_comment,
done=done,
criteria_scores=criteria_scores,
overall_score=overall_quality,
judge_reasoning=judge_reasoning,
elapsed_seconds=elapsed,
)
# ── Report to Langfuse ───────────────────────────────────────
trace_id = langfuse_eval.log_eval_trace(
fixture_name=fixture.name,
model=model,
prompt_variant="interactive",
prompt_template=prompt_template or "(not generated)",
actual_mutations=[{
"conversation": conversation[:30],
"user_comment": user_comment,
}],
scores_summary=result.summary(),
langfuse_prompt_names=["journey_system"],
)
if trace_id:
from eval.scorer import EvalScores
scores_obj = EvalScores(
fixture_name=fixture.name,
model=model,
prompt_variant="interactive",
precision=overall,
recall=float(done),
f1=overall,
llm_judge_score=overall_quality,
llm_judge_reasoning=judge_reasoning,
)
langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id)
_print_system(f"Results reported to Langfuse (trace: {trace_id})")
else:
_print_system("Langfuse not configured — results not reported.")
return result