refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions
--- a/services/batch-agent/eval/interactive.py
+++ b/services/batch-agent/eval/interactive.py
@@ -0,0 +1,471 @@
+"""Interactive journey session — human-in-the-loop CLI conversation.
+
+Flow:
+1. Show the system prompt used by the journey AI.
+2. Start the journey (AI explores files, asks first question).
+3. User types responses in the terminal — AI replies.
+4. User types `/done` to end the conversation.
+5. User writes a comment about the interaction quality.
+6. LLM judge scores the conversation + generated template.
+7. Results are reported to Langfuse.
+
+Usage::
+
+    python -m eval interactive                        # pick a fixture interactively
+    python -m eval interactive --fixture=journey-invoice-setup
+    python -m eval interactive --model=gpt-4o
+    python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import sys
+import time
+import uuid
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from langchain_core.messages import HumanMessage, SystemMessage
+
+from eval import langfuse_eval
+from eval.config import JourneyFixture, discover_journey_fixtures
+from eval.mock_executor import MockExecutor
+
+logger = logging.getLogger(__name__)
+
+# ── Special commands ─────────────────────────────────────────────────────
+
+# Slash commands recognised at the "YOU:" prompt.  run_interactive compares
+# each of them against the lower-cased, stripped user input.
+_CMD_DONE = "/done"
+_CMD_QUIT = "/quit"
+_CMD_TEMPLATE = "/template"
+_CMD_HELP = "/help"
+
+# Text printed verbatim when the user types the help command.
+_HELP_TEXT = f"""\
+  {_CMD_DONE}       — End the conversation and proceed to evaluation
+  {_CMD_QUIT}       — Abort without evaluation
+  {_CMD_TEMPLATE}   — Show the generated template (if any)
+  {_CMD_HELP}       — Show this help"""
+
+# ── Terminal colours (ANSI) ──────────────────────────────────────────────
+
+# ANSI escape sequences used by the _print_* helpers.  _C_RESET ends any
+# styling started by the other codes; _C_BOLD and _C_DIM change weight, the
+# remaining codes select a foreground colour.
+_C_RESET = "\033[0m"
+_C_BOLD = "\033[1m"
+_C_DIM = "\033[2m"
+_C_CYAN = "\033[36m"
+_C_GREEN = "\033[32m"
+_C_YELLOW = "\033[33m"
+_C_MAGENTA = "\033[35m"
+_C_RED = "\033[31m"
+_C_BLUE = "\033[34m"
+
+
+def _print_header(text: str) -> None:
+    """Print ``text`` as a bold cyan banner framed by two 80-column rules."""
+    rule = "═" * 80
+    banner_rows = (
+        f"\n{_C_BOLD}{_C_CYAN}{rule}",
+        f"  {text}",
+        f"{rule}{_C_RESET}\n",
+    )
+    for row in banner_rows:
+        print(row)
+
+
+def _print_ai(text: str) -> None:
+    """Print an assistant reply behind a bold green ``AI:`` tag, padded by blank lines."""
+    tag = f"{_C_GREEN}{_C_BOLD}AI:{_C_RESET}"
+    print(f"\n{tag} {text}\n")
+
+
+def _print_system(text: str) -> None:
+    """Print a harness status line in dim text."""
+    print("{}{}{}".format(_C_DIM, text, _C_RESET))
+
+
+def _print_score(label: str, score: float) -> None:
+    """Print one criterion score with a colour-coded verdict.
+
+    Scores of 0.7 and above are tagged PASS (green), scores of at least 0.4
+    are tagged PARTIAL (yellow), and anything lower is tagged FAIL (red).
+    """
+    # Bands are checked from the highest floor down; the first match wins.
+    bands = (
+        (0.7, _C_GREEN, "PASS"),
+        (0.4, _C_YELLOW, "PARTIAL"),
+    )
+    color, tag = _C_RED, "FAIL"
+    for floor, band_color, band_tag in bands:
+        if score >= floor:
+            color, tag = band_color, band_tag
+            break
+    print(f"  {color}{tag:>7}{_C_RESET} ({score:.1f}) {label}")
+
+
+# ── Result type ──────────────────────────────────────────────────────────
+
+
+@dataclass
+class InteractiveResult:
+    """Outcome of one interactive journey session, including the judge's scores."""
+
+    # Name of the journey fixture the session ran against.
+    fixture_name: str
+    # Model that drove the journey conversation.
+    model: str
+    # Model that scored the session.
+    judge_model: str
+    # Template produced by the journey, or None when none was generated.
+    prompt_template: str | None
+    # Transcript entries, each a dict with "role" and "content" keys.
+    conversation: list[dict[str, str]]
+    # Free-text comment entered by the human after the conversation.
+    user_comment: str
+    # True when the journey itself reported completion.
+    done: bool
+    # Judge score per criterion, keyed by the criterion text.
+    criteria_scores: dict[str, float]
+    # The judge's overall quality score.
+    overall_score: float
+    # The judge's explanation (or an error / abort note).
+    judge_reasoning: str
+    # Wall-clock duration of the conversation in seconds.
+    elapsed_seconds: float
+
+    def summary(self) -> dict[str, Any]:
+        """Return a compact dict of the session with scores and timing rounded."""
+        user_turns = sum(1 for entry in self.conversation if entry["role"] == "user")
+        rounded_criteria = {
+            name: round(value, 3) for name, value in self.criteria_scores.items()
+        }
+        return {
+            "fixture": self.fixture_name,
+            "model": self.model,
+            "judge_model": self.judge_model,
+            "done": self.done,
+            "turns": user_turns,
+            "overall_score": round(self.overall_score, 3),
+            "user_comment": self.user_comment,
+            "criteria_scores": rounded_criteria,
+            "elapsed_s": round(self.elapsed_seconds, 1),
+        }
+
+
+# ── LLM judge ────────────────────────────────────────────────────────────
+
+# System message sent to the judge LLM by _judge_interactive.  The JSON keys
+# requested here ("criteria_scores", "overall_quality", "reasoning") are the
+# ones that function reads back from the reply, so keep the two in sync.
+_INTERACTIVE_JUDGE_SYSTEM = """\
+You are an evaluation judge for AI-generated prompt templates produced during
+an interactive conversation between a human and a journey chatbot.
+
+The chatbot explored a directory and through multi-turn conversation with the
+user produced a prompt_template — an instruction set for a data-extraction agent.
+
+You have access to:
+- The full conversation transcript
+- The generated prompt_template (if any)
+- The user's own comment about the interaction
+- A list of quality criteria
+
+Score each criterion from 0 to 1:
+  - 1.0: Fully satisfied
+  - 0.5: Partially satisfied
+  - 0.0: Not satisfied
+
+Also provide an overall_quality score (0-1) evaluating the conversation flow,
+how well the AI understood the user, and the template quality.
+
+Respond with ONLY a JSON object:
+{
+  "criteria_scores": {"criterion_1": 0.8, ...},
+  "overall_quality": 0.85,
+  "reasoning": "Brief explanation covering both conversation quality and template accuracy"
+}
+"""
+
+
+async def _judge_interactive(
+    conversation: list[dict[str, str]],
+    prompt_template: str | None,
+    user_comment: str,
+    criteria: list[str],
+    *,
+    judge_model: str = "gpt-4o-mini",
+) -> tuple[dict[str, float], float, str]:
+    """Score an interactive session with an LLM judge.
+
+    Parameters
+    ----------
+    conversation :
+        Transcript entries with ``"role"`` and ``"content"`` keys; every role
+        other than ``"user"`` is rendered as ``AI`` in the judge prompt.
+    prompt_template :
+        Template produced by the journey, or ``None`` if none was generated.
+    user_comment :
+        The human's free-text comment about the session.
+    criteria :
+        Quality criteria the judge must score.
+    judge_model :
+        Model name passed to ``get_llm`` (called with ``temperature=0``).
+
+    Returns
+    -------
+    tuple
+        ``(criteria_scores, overall_quality, reasoning)``.  Scoring is
+        best-effort: if the call or the parsing of the reply fails, every
+        criterion and the overall quality are 0.0 and ``reasoning`` carries
+        the error text.
+    """
+    from shared.llm import get_llm
+
+    llm = get_llm(model=judge_model, temperature=0)
+
+    conv_text = "\n".join(
+        f"{'USER' if t['role'] == 'user' else 'AI'}: {t['content']}"
+        for t in conversation
+    )
+    criteria_text = "\n".join(f"  {i+1}. {c}" for i, c in enumerate(criteria))
+
+    user_content = (
+        f"## Conversation transcript\n```\n{conv_text}\n```\n\n"
+        f"## Generated prompt_template\n```\n{prompt_template or '(none — conversation did not complete)'}\n```\n\n"
+        f"## User's comment\n{user_comment}\n\n"
+        f"## Criteria to evaluate\n{criteria_text}"
+    )
+
+    try:
+        response = await llm.ainvoke([
+            SystemMessage(content=_INTERACTIVE_JUDGE_SYSTEM),
+            HumanMessage(content=user_content),
+        ])
+        raw = response.content.strip()
+        # Judges often wrap the JSON in a Markdown fence despite the prompt;
+        # keep only the fenced payload and drop an optional "json" tag.
+        if raw.startswith("```"):
+            raw = raw.split("```")[1]
+            if raw.startswith("json"):
+                raw = raw[4:]
+        parsed = json.loads(raw.strip())
+
+        scores_raw = parsed.get("criteria_scores", parsed.get("scores", {}))
+        criteria_scores = _scores_by_criterion(scores_raw, criteria)
+
+        overall = float(parsed.get("overall_quality", 0.0))
+        reasoning = str(parsed.get("reasoning", ""))
+        return criteria_scores, overall, reasoning
+
+    except Exception as exc:
+        # Deliberately broad: a failed judge must not abort the session, so
+        # the failure is logged and reported as an all-zero score.
+        logger.warning("interactive judge failed: %s", exc)
+        return {c: 0.0 for c in criteria}, 0.0, f"Judge error: {exc}"
+
+
+def _scores_by_criterion(scores_raw: Any, criteria: list[str]) -> dict[str, float]:
+    """Map the judge's raw scores onto ``criteria``, keyed by criterion text.
+
+    Each criterion is looked up under ``criterion_<n>``, its full text, its
+    first 50 characters, and ``<n>`` (1-based).  Only when none of those keys
+    is present does the n-th raw value serve as a positional fallback; a
+    criterion with neither gets 0.0.  A bare list of scores is read
+    positionally.
+    """
+    if isinstance(scores_raw, dict):
+        keyed: dict[Any, Any] = scores_raw
+        positional = list(scores_raw.values())
+    else:
+        keyed = {}
+        positional = list(scores_raw)
+
+    resolved: dict[str, float] = {}
+    for index, criterion in enumerate(criteria):
+        candidates = (
+            f"criterion_{index + 1}",
+            criterion,
+            criterion[:50],
+            str(index + 1),
+        )
+        matched = next((key for key in candidates if key in keyed), None)
+        if matched is not None:
+            # A keyed score is authoritative even when it is 0.0; using 0.0
+            # as the "not found" marker would let the positional fallback
+            # overwrite a genuine zero with another criterion's score.
+            score = float(keyed[matched])
+        elif index < len(positional):
+            score = float(positional[index])
+        else:
+            score = 0.0
+        resolved[criterion] = score
+    return resolved
+
+
+# ── Interactive session ──────────────────────────────────────────────────
+
+
+async def run_interactive(
+    fixture: JourneyFixture,
+    *,
+    model: str = "gpt-4o",
+    judge_model: str = "gpt-4o-mini",
+    data_dir: Path | None = None,
+) -> InteractiveResult:
+    """Run an interactive journey session in the terminal.
+
+    The session prints the journey system prompt, starts the journey, relays
+    terminal input to the journey AI until it reports completion or the user
+    types ``/done`` or ``/quit``, collects a free-text comment, has an LLM
+    judge score the session, and reports the outcome to Langfuse.
+
+    ``settings.LLM_MODEL`` is overridden with ``model`` while the journey
+    runs and is restored on every exit path, including failures that occur
+    before the conversation starts.
+
+    Parameters
+    ----------
+    fixture :
+        Journey fixture supplying the name, directory, data types and the
+        criteria handed to the judge.
+    model :
+        Model name written to ``settings.LLM_MODEL`` for the conversation.
+    judge_model :
+        Model used by the LLM judge.
+    data_dir :
+        If set, overrides the fixture's sample-file directory.  The LLM
+        will explore this folder instead of the default
+        ``fixtures/sample_files/…``.  Useful for private test data that
+        shouldn't be committed to git.
+
+    Returns
+    -------
+    InteractiveResult
+        Transcript, template, comment and scores.  After ``/quit`` (or
+        EOF / Ctrl-C at the prompt) the result has no scores and neither the
+        judge nor Langfuse is invoked.
+    """
+    from shared.config import settings
+    from shared.ws_context import set_current_user, clear_current_user
+    from app.journey import (
+        handle_journey_start,
+        handle_journey_message,
+        _build_system_prompt,
+    )
+
+    # When --data-dir is given, the MockExecutor's root becomes
+    # data_dir's parent and the journey directory is data_dir's name.
+    # This way the LLM sees a meaningful directory name (not ".") and
+    # MockExecutor resolves paths correctly.
+    # Otherwise, use the fixture's YAML parent and its relative path.
+    if data_dir:
+        mock_root = data_dir.parent
+        journey_directory = data_dir.name
+    else:
+        mock_root = fixture.fixture_path.parent
+        journey_directory = fixture.directory
+
+    mock = MockExecutor(
+        fixture_dir=mock_root,
+        seed_records={},
+    )
+
+    eval_user_id = f"interactive-{uuid.uuid4().hex[:8]}"
+    conversation: list[dict[str, str]] = []
+    prompt_template: str | None = None
+    done = False
+
+    original_model = settings.LLM_MODEL
+    settings.LLM_MODEL = model
+    # The outer try/finally guards the global model override: it must be
+    # undone even if building or printing the system prompt fails.
+    try:
+        # ── Show system prompt ───────────────────────────────────
+        system_prompt = _build_system_prompt(journey_directory, fixture.data_types)
+
+        _print_header("SYSTEM PROMPT")
+        print(f"{_C_DIM}{system_prompt}{_C_RESET}")
+
+        _print_header(f"INTERACTIVE JOURNEY  |  fixture: {fixture.name}  |  model: {model}")
+        print(f"  Data dir: {mock_root}")
+        print(f"  Type your responses. Commands: {_CMD_DONE}, {_CMD_QUIT}, {_CMD_TEMPLATE}, {_CMD_HELP}")
+        print(f"  Judge model: {judge_model}")
+        print(f"  Criteria: {len(fixture.expected_template_criteria)}")
+        print()
+
+        start_time = time.time()
+
+        try:
+            set_current_user(eval_user_id)
+
+            with mock.patch():
+                # ── Start ────────────────────────────────────────
+                _print_system("Starting journey... (AI is exploring your files)")
+
+                start_frame: dict[str, Any] = {
+                    "agent_type": "local",
+                    "directory": journey_directory,
+                    "data_types": fixture.data_types,
+                    "session_id": f"interactive-{uuid.uuid4().hex[:8]}",
+                }
+
+                reply = await handle_journey_start(eval_user_id, start_frame)
+                session_id = reply["session_id"]
+                conversation.append({"role": "assistant", "content": reply["message"]})
+                _print_ai(reply["message"])
+
+                if reply["done"]:
+                    prompt_template = reply.get("prompt_template")
+                    done = True
+                    _print_system("Journey completed on first reply (template generated).")
+
+                # ── Conversation loop ────────────────────────────
+                while not done:
+                    try:
+                        user_input = input(f"{_C_BOLD}{_C_BLUE}YOU:{_C_RESET} ").strip()
+                    except (EOFError, KeyboardInterrupt):
+                        # A closed stdin or Ctrl-C at the prompt counts as /quit.
+                        print()
+                        user_input = _CMD_QUIT
+
+                    if not user_input:
+                        continue
+
+                    command = user_input.lower()
+
+                    if command == _CMD_QUIT:
+                        _print_system("Aborted — no evaluation will be performed.")
+                        # No explicit cleanup here: the enclosing finally
+                        # blocks clear the user and restore the model.
+                        return InteractiveResult(
+                            fixture_name=fixture.name, model=model, judge_model=judge_model,
+                            prompt_template=None, conversation=conversation,
+                            user_comment="(aborted)", done=False,
+                            criteria_scores={}, overall_score=0.0,
+                            judge_reasoning="Session aborted by user.",
+                            elapsed_seconds=time.time() - start_time,
+                        )
+
+                    if command == _CMD_HELP:
+                        print(_HELP_TEXT)
+                        continue
+
+                    if command == _CMD_TEMPLATE:
+                        if prompt_template:
+                            print(f"\n{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
+                        else:
+                            _print_system("No template generated yet.")
+                        continue
+
+                    if command == _CMD_DONE:
+                        _print_system("Ending conversation...")
+                        break
+
+                    # ── Send message to AI ───────────────────────
+                    conversation.append({"role": "user", "content": user_input})
+                    _print_system("AI is thinking...")
+
+                    msg_frame: dict[str, Any] = {
+                        "session_id": session_id,
+                        "message": user_input,
+                    }
+                    reply = await handle_journey_message(eval_user_id, msg_frame)
+                    conversation.append({"role": "assistant", "content": reply["message"]})
+                    _print_ai(reply["message"])
+
+                    if reply["done"]:
+                        prompt_template = reply.get("prompt_template")
+                        done = True
+                        _print_system("Journey completed — template generated!")
+
+        except Exception as exc:
+            # A journey failure ends the conversation but not the session:
+            # whatever transcript exists is still commented on and judged.
+            logger.exception("interactive journey failed: %s", exc)
+            _print_system(f"Error: {exc}")
+        finally:
+            clear_current_user()
+    finally:
+        settings.LLM_MODEL = original_model
+
+    elapsed = time.time() - start_time
+    turns = len([c for c in conversation if c["role"] == "user"])
+
+    # ── Show template if generated ───────────────────────────────
+    if prompt_template:
+        _print_header("GENERATED TEMPLATE")
+        print(f"{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
+    else:
+        _print_system("No template was generated during this session.")
+
+    # ── User comment ─────────────────────────────────────────────
+    _print_header("YOUR EVALUATION")
+    print("  Write your comment about this interaction (press Enter twice to finish):")
+    print()
+    user_comment = _read_user_comment()
+
+    # ── Judge ────────────────────────────────────────────────────
+    _print_header("LLM JUDGE EVALUATION")
+    _print_system(f"Scoring with {judge_model}...")
+
+    criteria_scores, overall_quality, judge_reasoning = await _judge_interactive(
+        conversation=conversation,
+        prompt_template=prompt_template,
+        user_comment=user_comment,
+        criteria=fixture.expected_template_criteria,
+        judge_model=judge_model,
+    )
+
+    # ── Display scores ───────────────────────────────────────────
+    print()
+    for criterion, score in criteria_scores.items():
+        _print_score(criterion, score)
+
+    # Plain mean of the per-criterion scores; reported separately from the
+    # judge's own overall_quality.
+    overall = (
+        sum(criteria_scores.values()) / len(criteria_scores)
+        if criteria_scores
+        else 0.0
+    )
+
+    print(f"\n  {_C_BOLD}Criteria avg:      {overall:.2f}{_C_RESET}")
+    print(f"  {_C_BOLD}Overall quality:   {overall_quality:.2f}{_C_RESET}")
+    print(f"  {_C_BOLD}Turns:             {turns}{_C_RESET}")
+    print(f"  {_C_BOLD}Time:              {elapsed:.1f}s{_C_RESET}")
+    print(f"\n  {_C_DIM}Judge: {judge_reasoning}{_C_RESET}")
+    print(f"  {_C_DIM}Your comment: {user_comment}{_C_RESET}\n")
+
+    result = InteractiveResult(
+        fixture_name=fixture.name,
+        model=model,
+        judge_model=judge_model,
+        prompt_template=prompt_template,
+        conversation=conversation,
+        user_comment=user_comment,
+        done=done,
+        criteria_scores=criteria_scores,
+        overall_score=overall_quality,
+        judge_reasoning=judge_reasoning,
+        elapsed_seconds=elapsed,
+    )
+
+    # ── Report to Langfuse ───────────────────────────────────────
+    trace_id = langfuse_eval.log_eval_trace(
+        fixture_name=fixture.name,
+        model=model,
+        prompt_variant="interactive",
+        prompt_template=prompt_template or "(not generated)",
+        actual_mutations=[{
+            # Only the first 30 transcript entries are sent with the trace.
+            "conversation": conversation[:30],
+            "user_comment": user_comment,
+        }],
+        scores_summary=result.summary(),
+        langfuse_prompt_names=["journey_system"],
+    )
+
+    if trace_id:
+        from eval.scorer import EvalScores
+        # EvalScores fields are reused for interactive metrics: precision and
+        # f1 carry the criteria average, recall carries the done flag.
+        scores_obj = EvalScores(
+            fixture_name=fixture.name,
+            model=model,
+            prompt_variant="interactive",
+            precision=overall,
+            recall=float(done),
+            f1=overall,
+            llm_judge_score=overall_quality,
+            llm_judge_reasoning=judge_reasoning,
+        )
+        langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id)
+        _print_system(f"Results reported to Langfuse (trace: {trace_id})")
+    else:
+        _print_system("Langfuse not configured — results not reported.")
+
+    return result
+
+
+def _read_user_comment() -> str:
+    """Read a multi-line comment from stdin, finished by two consecutive empty lines.
+
+    EOF or Ctrl-C ends the input early and keeps whatever was typed so far.
+    Returns ``"(no comment)"`` when nothing but whitespace was entered.
+    """
+    comment_lines: list[str] = []
+    try:
+        while True:
+            line = input()
+            if line == "" and comment_lines and comment_lines[-1] == "":
+                comment_lines.pop()  # remove trailing empty
+                break
+            comment_lines.append(line)
+    except (EOFError, KeyboardInterrupt):
+        # Intentional: an interrupted comment is still a usable comment.
+        pass
+    return "\n".join(comment_lines).strip() or "(no comment)"