"""Interactive journey session — human-in-the-loop CLI conversation. Flow: 1. Show the system prompt used by the journey AI. 2. Start the journey (AI explores files, asks first question). 3. User types responses in the terminal — AI replies. 4. User types `/done` to end the conversation. 5. User writes a comment about the interaction quality. 6. LLM judge scores the conversation + generated template. 7. Results are reported to Langfuse. Usage:: python -m eval interactive # pick a fixture interactively python -m eval interactive --fixture=journey-invoice-setup python -m eval interactive --model=gpt-4o python -m eval interactive --judge-model=github_copilot/gpt-4o-mini """ from __future__ import annotations import asyncio import json import logging import sys import time import uuid from dataclasses import dataclass, field from typing import Any from langchain_core.messages import HumanMessage, SystemMessage from eval.config import JourneyFixture, discover_journey_fixtures from eval.mock_executor import MockExecutor from eval import langfuse_eval logger = logging.getLogger(__name__) # ── Special commands ───────────────────────────────────────────────────── _CMD_DONE = "/done" _CMD_QUIT = "/quit" _CMD_TEMPLATE = "/template" _CMD_HELP = "/help" _HELP_TEXT = f"""\ {_CMD_DONE} — End the conversation and proceed to evaluation {_CMD_QUIT} — Abort without evaluation {_CMD_TEMPLATE} — Show the generated template (if any) {_CMD_HELP} — Show this help""" # ── Terminal colours (ANSI) ────────────────────────────────────────────── _C_RESET = "\033[0m" _C_BOLD = "\033[1m" _C_DIM = "\033[2m" _C_CYAN = "\033[36m" _C_GREEN = "\033[32m" _C_YELLOW = "\033[33m" _C_MAGENTA = "\033[35m" _C_RED = "\033[31m" _C_BLUE = "\033[34m" def _print_header(text: str) -> None: print(f"\n{_C_BOLD}{_C_CYAN}{'═' * 80}") print(f" {text}") print(f"{'═' * 80}{_C_RESET}\n") def _print_ai(text: str) -> None: print(f"\n{_C_GREEN}{_C_BOLD}AI:{_C_RESET} {text}\n") def _print_system(text: str) -> None: print(f"{_C_DIM}{text}{_C_RESET}") def _print_score(label: str, score: float) -> None: if score >= 0.7: color = _C_GREEN tag = "PASS" elif score >= 0.4: color = _C_YELLOW tag = "PARTIAL" else: color = _C_RED tag = "FAIL" print(f" {color}{tag:>7}{_C_RESET} ({score:.1f}) {label}") # ── Result type ────────────────────────────────────────────────────────── @dataclass class InteractiveResult: fixture_name: str model: str judge_model: str prompt_template: str | None conversation: list[dict[str, str]] user_comment: str done: bool criteria_scores: dict[str, float] overall_score: float judge_reasoning: str elapsed_seconds: float def summary(self) -> dict[str, Any]: return { "fixture": self.fixture_name, "model": self.model, "judge_model": self.judge_model, "done": self.done, "turns": len([c for c in self.conversation if c["role"] == "user"]), "overall_score": round(self.overall_score, 3), "user_comment": self.user_comment, "criteria_scores": {k: round(v, 3) for k, v in self.criteria_scores.items()}, "elapsed_s": round(self.elapsed_seconds, 1), } # ── LLM judge ──────────────────────────────────────────────────────────── _INTERACTIVE_JUDGE_SYSTEM = """\ You are an evaluation judge for AI-generated prompt templates produced during an interactive conversation between a human and a journey chatbot. The chatbot explored a directory and through multi-turn conversation with the user produced a prompt_template — an instruction set for a data-extraction agent. You have access to: - The full conversation transcript - The generated prompt_template (if any) - The user's own comment about the interaction - A list of quality criteria Score each criterion from 0 to 1: - 1.0: Fully satisfied - 0.5: Partially satisfied - 0.0: Not satisfied Also provide an overall_quality score (0-1) evaluating the conversation flow, how well the AI understood the user, and the template quality. Respond with ONLY a JSON object: { "criteria_scores": {"criterion_1": 0.8, ...}, "overall_quality": 0.85, "reasoning": "Brief explanation covering both conversation quality and template accuracy" } """ async def _judge_interactive( conversation: list[dict[str, str]], prompt_template: str | None, user_comment: str, criteria: list[str], *, judge_model: str = "gpt-4o-mini", ) -> tuple[dict[str, float], float, str]: """Score an interactive session. Returns (criteria_scores, overall_quality, reasoning).""" from shared.llm import get_llm llm = get_llm(model=judge_model, temperature=0) conv_text = "\n".join( f"{'USER' if t['role'] == 'user' else 'AI'}: {t['content']}" for t in conversation ) criteria_text = "\n".join(f" {i+1}. {c}" for i, c in enumerate(criteria)) user_content = ( f"## Conversation transcript\n```\n{conv_text}\n```\n\n" f"## Generated prompt_template\n```\n{prompt_template or '(none — conversation did not complete)'}\n```\n\n" f"## User's comment\n{user_comment}\n\n" f"## Criteria to evaluate\n{criteria_text}" ) try: response = await llm.ainvoke([ SystemMessage(content=_INTERACTIVE_JUDGE_SYSTEM), HumanMessage(content=user_content), ]) raw = response.content.strip() if raw.startswith("```"): raw = raw.split("```")[1] if raw.startswith("json"): raw = raw[4:] parsed = json.loads(raw.strip()) scores_raw = parsed.get("criteria_scores", parsed.get("scores", {})) criteria_scores: dict[str, float] = {} for i, criterion in enumerate(criteria): key_candidates = [f"criterion_{i+1}", criterion, criterion[:50], str(i + 1)] score = 0.0 for key in key_candidates: if key in scores_raw: score = float(scores_raw[key]) break if score == 0.0 and i < len(scores_raw): score = float(list(scores_raw.values())[i]) criteria_scores[criterion] = score overall = float(parsed.get("overall_quality", 0.0)) reasoning = str(parsed.get("reasoning", "")) return criteria_scores, overall, reasoning except Exception as exc: logger.warning("interactive judge failed: %s", exc) return {c: 0.0 for c in criteria}, 0.0, f"Judge error: {exc}" # ── Interactive session ────────────────────────────────────────────────── async def run_interactive( fixture: JourneyFixture, *, model: str = "gpt-4o", judge_model: str = "gpt-4o-mini", data_dir: Path | None = None, ) -> InteractiveResult: """Run an interactive journey session in the terminal. Parameters ---------- data_dir : If set, overrides the fixture's sample-file directory. The LLM will explore this folder instead of the default ``fixtures/sample_files/…``. Useful for private test data that shouldn't be committed to git. """ from shared.config import settings from shared.ws_context import set_current_user, clear_current_user from app.journey import ( handle_journey_start, handle_journey_message, _build_system_prompt, ) # When --data-dir is given, the MockExecutor's root becomes # data_dir's parent and the journey directory is data_dir's name. # This way the LLM sees a meaningful directory name (not ".") and # MockExecutor resolves paths correctly. # Otherwise, use the fixture's YAML parent and its relative path. if data_dir: mock_root = data_dir.parent journey_directory = data_dir.name else: mock_root = fixture.fixture_path.parent journey_directory = fixture.directory mock = MockExecutor( fixture_dir=mock_root, seed_records={}, ) original_model = settings.LLM_MODEL settings.LLM_MODEL = model eval_user_id = f"interactive-{uuid.uuid4().hex[:8]}" # ── Show system prompt ─────────────────────────────────────── system_prompt = _build_system_prompt(journey_directory, fixture.data_types) _print_header("SYSTEM PROMPT") print(f"{_C_DIM}{system_prompt}{_C_RESET}") _print_header(f"INTERACTIVE JOURNEY | fixture: {fixture.name} | model: {model}") print(f" Data dir: {mock_root}") print(f" Type your responses. Commands: {_CMD_DONE}, {_CMD_QUIT}, {_CMD_TEMPLATE}, {_CMD_HELP}") print(f" Judge model: {judge_model}") print(f" Criteria: {len(fixture.expected_template_criteria)}") print() conversation: list[dict[str, str]] = [] prompt_template: str | None = None done = False start_time = time.time() try: set_current_user(eval_user_id) with mock.patch(): # ── Start ──────────────────────────────────────────── _print_system("Starting journey... (AI is exploring your files)") start_frame: dict[str, Any] = { "agent_type": "local", "directory": journey_directory, "data_types": fixture.data_types, "session_id": f"interactive-{uuid.uuid4().hex[:8]}", } reply = await handle_journey_start(eval_user_id, start_frame) session_id = reply["session_id"] conversation.append({"role": "assistant", "content": reply["message"]}) _print_ai(reply["message"]) if reply["done"]: prompt_template = reply.get("prompt_template") done = True _print_system("Journey completed on first reply (template generated).") # ── Conversation loop ──────────────────────────────── while not done: try: user_input = input(f"{_C_BOLD}{_C_BLUE}YOU:{_C_RESET} ").strip() except (EOFError, KeyboardInterrupt): print() user_input = _CMD_QUIT if not user_input: continue # Handle commands if user_input.lower() == _CMD_QUIT: _print_system("Aborted — no evaluation will be performed.") settings.LLM_MODEL = original_model clear_current_user() return InteractiveResult( fixture_name=fixture.name, model=model, judge_model=judge_model, prompt_template=None, conversation=conversation, user_comment="(aborted)", done=False, criteria_scores={}, overall_score=0.0, judge_reasoning="Session aborted by user.", elapsed_seconds=time.time() - start_time, ) if user_input.lower() == _CMD_HELP: print(_HELP_TEXT) continue if user_input.lower() == _CMD_TEMPLATE: if prompt_template: print(f"\n{_C_MAGENTA}{prompt_template}{_C_RESET}\n") else: _print_system("No template generated yet.") continue if user_input.lower() == _CMD_DONE: _print_system("Ending conversation...") break # ── Send message to AI ─────────────────────────── conversation.append({"role": "user", "content": user_input}) _print_system("AI is thinking...") msg_frame: dict[str, Any] = { "session_id": session_id, "message": user_input, } reply = await handle_journey_message(eval_user_id, msg_frame) conversation.append({"role": "assistant", "content": reply["message"]}) _print_ai(reply["message"]) if reply["done"]: prompt_template = reply.get("prompt_template") done = True _print_system("Journey completed — template generated!") except Exception as exc: logger.error("interactive journey failed: %s", exc) _print_system(f"Error: {exc}") finally: settings.LLM_MODEL = original_model clear_current_user() elapsed = time.time() - start_time turns = len([c for c in conversation if c["role"] == "user"]) # ── Show template if generated ─────────────────────────────── if prompt_template: _print_header("GENERATED TEMPLATE") print(f"{_C_MAGENTA}{prompt_template}{_C_RESET}\n") else: _print_system("No template was generated during this session.") # ── User comment ───────────────────────────────────────────── _print_header("YOUR EVALUATION") print(" Write your comment about this interaction (press Enter twice to finish):") print() comment_lines: list[str] = [] try: while True: line = input() if line == "" and comment_lines and comment_lines[-1] == "": comment_lines.pop() # remove trailing empty break comment_lines.append(line) except (EOFError, KeyboardInterrupt): pass user_comment = "\n".join(comment_lines).strip() or "(no comment)" # ── Judge ──────────────────────────────────────────────────── _print_header("LLM JUDGE EVALUATION") _print_system(f"Scoring with {judge_model}...") criteria_scores, overall_quality, judge_reasoning = await _judge_interactive( conversation=conversation, prompt_template=prompt_template, user_comment=user_comment, criteria=fixture.expected_template_criteria, judge_model=judge_model, ) # ── Display scores ─────────────────────────────────────────── print() for criterion, score in criteria_scores.items(): _print_score(criterion, score) overall = ( sum(criteria_scores.values()) / len(criteria_scores) if criteria_scores else 0.0 ) print(f"\n {_C_BOLD}Criteria avg: {overall:.2f}{_C_RESET}") print(f" {_C_BOLD}Overall quality: {overall_quality:.2f}{_C_RESET}") print(f" {_C_BOLD}Turns: {turns}{_C_RESET}") print(f" {_C_BOLD}Time: {elapsed:.1f}s{_C_RESET}") print(f"\n {_C_DIM}Judge: {judge_reasoning}{_C_RESET}") print(f" {_C_DIM}Your comment: {user_comment}{_C_RESET}\n") result = InteractiveResult( fixture_name=fixture.name, model=model, judge_model=judge_model, prompt_template=prompt_template, conversation=conversation, user_comment=user_comment, done=done, criteria_scores=criteria_scores, overall_score=overall_quality, judge_reasoning=judge_reasoning, elapsed_seconds=elapsed, ) # ── Report to Langfuse ─────────────────────────────────────── trace_id = langfuse_eval.log_eval_trace( fixture_name=fixture.name, model=model, prompt_variant="interactive", prompt_template=prompt_template or "(not generated)", actual_mutations=[{ "conversation": conversation[:30], "user_comment": user_comment, }], scores_summary=result.summary(), langfuse_prompt_names=["journey_system"], ) if trace_id: from eval.scorer import EvalScores scores_obj = EvalScores( fixture_name=fixture.name, model=model, prompt_variant="interactive", precision=overall, recall=float(done), f1=overall, llm_judge_score=overall_quality, llm_judge_reasoning=judge_reasoning, ) langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id) _print_system(f"Results reported to Langfuse (trace: {trace_id})") else: _print_system("Langfuse not configured — results not reported.") return result