# api/tests/test_journey_v2.py

"""Tests for Local Agent V2 journey setup (Step 4).

Covers the chatbot journey that produces a structured ScoutConfig JSON
instead of a freeform prompt_template string.

Unit tests (no LLM)
--------------------
  4.6a  _extract_agent_config: valid JSON → returns serialised config
  4.6b  _extract_agent_config: invalid JSON → returns None
  4.6c  _extract_agent_config: markers absent → returns None
  4.6d  _extract_agent_config: only START marker → returns None
  4.6e  Session not found → done=True, agent_config=None
  4.6f  Nudge uses AGENT_CONFIG_START/END markers (not old PROMPT_TEMPLATE)

Eval test (real LLM + Langfuse scoring)
----------------------------------------
  4.1   Journey start explores directory → first reply contains a question

Cases 4.2–4.5 (multi-turn conversations producing a full ScoutConfig) are
non-deterministic and tested manually — results tracked in Langfuse.

Run:
    pytest tests/test_journey_v2.py -v
    pytest tests/test_journey_v2.py -v -k "4_6"          # unit only
    pytest tests/test_journey_v2.py -v -k "eval"          # single LLM eval
    pytest tests/test_journey_v2.py -v --journey-dir /p   # custom fixtures
"""

from __future__ import annotations

import uuid
from contextlib import nullcontext
from pathlib import Path
from typing import Any
from unittest.mock import patch

import pytest
import yaml

from app.api.routes.scout_setup import (
    _CONFIG_END,
    _CONFIG_START,
    _MAX_TURNS,
    _extract_agent_config,
    _sessions,
    handle_journey_message,
    handle_journey_start,
)
from app.core.langfuse_client import get_langfuse
from app.core.ws_context import clear_client_executor, set_client_executor
from app.schemas import ScoutConfig
from tests.conftest import TEST_USER_IDS

# ── Constants ─────────────────────────────────────────────────────────────

_USER_ID = TEST_USER_IDS["power"]

_DEFAULT_FIXTURE_DIR = Path(__file__).parent / "fixtures" / "journey_v2"

# ── Fixture loading ───────────────────────────────────────────────────────


def _fixtures_dir(config) -> Path:
    """Resolve the directory that holds the journey fixtures.

    The value of the ``--journey-dir`` option wins when it is set to a
    truthy value; otherwise the module-level default directory is used.
    """
    custom_dir = config.getoption("--journey-dir")
    if not custom_dir:
        return _DEFAULT_FIXTURE_DIR
    return Path(custom_dir)


def _load_cases(config) -> list[dict]:
    """Parse ``cases.yaml`` from the active fixtures directory.

    The file is read as UTF-8 and handed to ``yaml.safe_load``; whatever
    that call produces is returned unchanged.
    """
    cases_file = _fixtures_dir(config) / "cases.yaml"
    raw_yaml = cases_file.read_text(encoding="utf-8")
    return yaml.safe_load(raw_yaml)


def _read_data_file(filename: str, fixtures_dir: Path) -> str:
    """Return the UTF-8 text of ``filename`` inside ``fixtures_dir/data``."""
    data_dir = fixtures_dir / "data"
    target = data_dir / filename
    return target.read_text(encoding="utf-8")


# ── pytest_generate_tests ─────────────────────────────────────────────────


def pytest_generate_tests(metafunc):
    """Parametrize ``journey_case`` with one entry per loaded YAML case.

    Test functions that do not request the ``journey_case`` fixture are
    left untouched. Each case's ``id`` value becomes its pytest id.
    """
    if "journey_case" in metafunc.fixturenames:
        loaded = _load_cases(metafunc.config)
        case_ids = [entry["id"] for entry in loaded]
        metafunc.parametrize("journey_case", loaded, ids=case_ids)


# ── Executor builder ──────────────────────────────────────────────────────


def _make_fs_executor(directory_files: list[dict], fixtures_dir: Path):
    """Build an async stand-in for the client-side filesystem executor.

    The returned coroutine function takes the full ``payload`` dict and
    returns a result dict, which is the shape ``set_client_executor`` /
    ``execute_on_client`` expect.

    ``directory_files`` is a list of ``{path, content_file}`` dicts, where
    ``content_file`` is resolved relative to ``fixtures_dir/data/``. All
    file contents are read once, up front, when this builder is called.
    """
    contents_by_path: dict[str, str] = {}
    for spec in directory_files:
        virtual_path = spec["path"]
        contents_by_path[virtual_path] = _read_data_file(
            spec["content_file"], fixtures_dir
        )

    async def _executor(payload: dict) -> dict:
        action = payload.get("action", "")
        args = payload.get("data") or {}

        if action == "list_directory":
            # Every known file is listed, whatever directory was requested.
            listing = []
            for known_path in contents_by_path:
                listing.append({
                    "type": "file",
                    "name": known_path.rpartition("/")[2],
                    "path": known_path,
                })
            result = {"entries": listing}
        elif action == "read_file_content":
            # Unknown paths read as an empty file rather than an error.
            requested = args.get("path", "")
            result = {"content": contents_by_path.get(requested, "")}
        elif action == "get_file_metadata":
            requested = args.get("path", "")
            base_name = requested.rpartition("/")[2]
            _, dot, suffix = base_name.rpartition(".")
            # Size and timestamps are fixed placeholder values.
            result = {
                "name": base_name,
                "extension": dot + suffix if dot else "",
                "size": 1024,
                "createdAt": None,
                "modifiedAt": None,
            }
        else:
            # Any other action gets an empty result.
            result = {}

        return result

    return _executor


# ── Journey runner helper ─────────────────────────────────────────────────


async def _run_journey(user_id: str, case: dict, executor) -> dict[str, Any]:
    """Play a whole case through the journey handlers and return the last reply.

    The journey is started once, then each entry of ``case["user_messages"]``
    is sent in order until the messages run out or a reply reports ``done``.

    As in ``device_ws._handle_journey_start/message``, the client executor
    is (re)installed before every handler call so filesystem tools work.
    On exit, successful or not, the executor is cleared and the session
    registered under the generated id is dropped from ``_sessions``.
    """
    session_id = str(uuid.uuid4())
    try:
        set_client_executor(executor)
        start_payload = {
            "agent_type": "local",
            "directory": case["directory"],
            "data_types": case["data_types"],
            "session_id": session_id,
        }
        reply = await handle_journey_start(user_id, start_payload)

        for user_text in case.get("user_messages", []):
            if reply.get("done"):
                # The journey has finished; remaining messages are not sent.
                break
            set_client_executor(executor)
            follow_up = {
                "session_id": reply["session_id"],
                "message": user_text,
            }
            reply = await handle_journey_message(user_id, follow_up)
    finally:
        clear_client_executor()
        _sessions.pop(session_id, None)

    return reply


# ── Assertion helper ──────────────────────────────────────────────────────


def _evaluate_case(case: dict, reply: dict) -> tuple[float, str]:
    """Score the final reply of a journey case.

    Args:
        case: Case definition. Only the ``expect_question`` flag is read.
        reply: Final reply dict from the journey handlers. Only the
            ``message`` entry is read.

    Returns:
        A ``(score, comment)`` tuple. When ``expect_question`` is truthy the
        score is 1.0 if the reply message contains a ``?`` and 0.0 otherwise.
        Cases without that flag score 1.0 unconditionally.
    """
    if case.get("expect_question"):
        # The key may be present with a None value. Treat that like a missing
        # message so the case scores 0.0 instead of raising TypeError on
        # ``"?" in None`` before any score can be reported.
        message = reply.get("message") or ""
        has_q = "?" in message
        return (1.0 if has_q else 0.0), f"first_reply_has_question={has_q}"

    return 1.0, "no specific assertion"


# ── Unit tests ────────────────────────────────────────────────────────────


def test_4_6a_extract_valid_json():
    """4.6a A valid config wrapped in the markers is extracted and parses back."""
    rule = "No project = no entity"
    original = ScoutConfig(
        content_types=[],
        global_rules=[rule],
        data_types=["tasks"],
    )
    payload = original.model_dump_json()
    # Surround the payload with unrelated text on both sides of the markers.
    llm_output = f"Some preamble\n{_CONFIG_START}\n{payload}\n{_CONFIG_END}\nTrailing"

    extracted = _extract_agent_config(llm_output)

    assert extracted is not None
    round_tripped = ScoutConfig.model_validate_json(extracted)
    assert round_tripped.global_rules == [rule]


def test_4_6b_extract_invalid_json():
    """4.6b Malformed JSON between both markers yields None."""
    llm_output = f"{_CONFIG_START}\n{{not: valid json\n{_CONFIG_END}"
    extracted = _extract_agent_config(llm_output)
    assert extracted is None


def test_4_6c_extract_markers_absent():
    """4.6c Text containing neither marker yields None."""
    extracted = _extract_agent_config("No markers here at all")
    assert extracted is None


def test_4_6d_extract_only_start_marker():
    """4.6d A START marker with no matching END marker yields None."""
    unterminated = f"text {_CONFIG_START} no end marker"
    extracted = _extract_agent_config(unterminated)
    assert extracted is None


@pytest.mark.asyncio
async def test_4_6e_session_not_found():
    """4.6e Unknown session id → done=True, no agent_config, explanatory message."""
    request = {
        "session_id": "nonexistent-session-id",
        "message": "Hello",
    }

    reply = await handle_journey_message(_USER_ID, request)

    assert reply["done"] is True
    assert reply["agent_config"] is None
    # Either wording is accepted as an explanation of the missing session.
    lowered = reply["message"].lower()
    assert any(phrase in lowered for phrase in ("not found", "expired"))


@pytest.mark.asyncio
async def test_4_6f_nudge_uses_new_markers():
    """4.6f The post-turn-limit nudge names the AGENT_CONFIG markers, not PROMPT_TEMPLATE."""
    from app.api.routes.scout_setup import JourneySession

    session_id = str(uuid.uuid4())
    seen_histories: list[list[dict]] = []

    async def _fake_llm(system_prompt, history, tools, **kwargs) -> str:
        # Snapshot the history so later appends do not alter what was captured.
        seen_histories.append(list(history))
        # A marker-free reply is what pushes the handler onto the nudge path.
        return "I still need more information from you."

    session = JourneySession(
        session_id=session_id,
        user_id=_USER_ID,
        agent_type="local",
        directory="/test",
        data_types=["tasks"],
        system_prompt="system",
        langfuse_prompt=None,
    )
    # Pre-load _MAX_TURNS user/assistant exchanges so that the very next
    # message is the one that hits the turn limit.
    for turn in range(_MAX_TURNS):
        for role, content in (("user", f"msg {turn}"), ("assistant", "ok")):
            session.history.append({"role": role, "content": content})
    _sessions[session_id] = session

    llm_target = "app.api.routes.scout_setup._call_llm_with_tools"
    try:
        with patch(llm_target, side_effect=_fake_llm):
            await handle_journey_message(_USER_ID, {
                "session_id": session_id,
                "message": "one more message to trigger nudge",
            })
    finally:
        _sessions.pop(session_id, None)

    # The nudge is expected in the history passed to the second LLM call.
    assert len(seen_histories) >= 2, "Expected ≥ 2 LLM calls (main reply + nudge)"
    second_call_history = seen_histories[1]
    user_contents = [
        turn["content"] for turn in second_call_history if turn["role"] == "user"
    ]
    user_msgs = " ".join(user_contents)
    assert _CONFIG_START in user_msgs, f"Nudge must reference {_CONFIG_START}"
    assert _CONFIG_END in user_msgs, f"Nudge must reference {_CONFIG_END}"
    assert "PROMPT_TEMPLATE" not in user_msgs, "Old PROMPT_TEMPLATE markers must not appear in nudge"


# ── Eval tests (real LLM + Langfuse) ─────────────────────────────────────


@pytest.mark.asyncio
@pytest.mark.eval
async def test_eval_journey(journey_case, pytestconfig):
    """Run one YAML-defined journey case end to end and assert a full score.

    When a Langfuse client is available, the run is wrapped in an
    observation and the score is attached to it; otherwise the case runs
    without any tracing.
    """
    case: dict = journey_case
    fixtures_dir = _fixtures_dir(pytestconfig)
    executor = _make_fs_executor(case.get("directory_files", []), fixtures_dir)

    lf = get_langfuse()
    if lf:
        case_id = case["id"]
        # Dots in the score name are replaced with dashes for the observation name.
        score_slug = case.get("score_name", "unknown").replace(".", "-")
        obs_ctx = lf.start_as_current_observation(
            name=f"eval-journey-{case_id}-{score_slug}",
            metadata={"step": "4", "case_id": case["id"]},
        )
    else:
        # No Langfuse client: a no-op context whose ``as`` target is None.
        obs_ctx = nullcontext()

    with obs_ctx as obs:
        reply = await _run_journey(_USER_ID, case, executor)
        score, comment = _evaluate_case(case, reply)

        if obs is not None:
            score_name = case.get("score_name", f"journey.case_{case['id']}")
            obs.score(name=score_name, value=score, comment=comment)

    if lf:
        lf.flush()

    assert score == 1.0, f"[{case['id']}] {case.get('description', '')} — {comment}"