"""Tests for Local Agent V2 journey setup (Step 4). Covers the chatbot journey that produces a structured AgentConfig JSON instead of a freeform prompt_template string. Unit tests (no LLM) -------------------- 4.6a _extract_agent_config: valid JSON → returns serialised config 4.6b _extract_agent_config: invalid JSON → returns None 4.6c _extract_agent_config: markers absent → returns None 4.6d _extract_agent_config: only START marker → returns None 4.6e Session not found → done=True, agent_config=None 4.6f Nudge uses AGENT_CONFIG_START/END markers (not old PROMPT_TEMPLATE) Eval test (real LLM + Langfuse scoring) ---------------------------------------- 4.1 Journey start explores directory → first reply contains a question Cases 4.2–4.5 (multi-turn conversations producing a full AgentConfig) are non-deterministic and tested manually — results tracked in Langfuse. Run: pytest tests/test_journey_v2.py -v pytest tests/test_journey_v2.py -v -k "4_6" # unit only pytest tests/test_journey_v2.py -v -k "eval" # single LLM eval pytest tests/test_journey_v2.py -v --journey-dir /p # custom fixtures """ from __future__ import annotations import uuid from contextlib import nullcontext from pathlib import Path from typing import Any from unittest.mock import patch import pytest import yaml from app.api.routes.agent_setup import ( _CONFIG_END, _CONFIG_START, _MAX_TURNS, _extract_agent_config, _sessions, handle_journey_message, handle_journey_start, ) from app.core.langfuse_client import get_langfuse from app.core.ws_context import clear_client_executor, set_client_executor from app.schemas import AgentConfig from tests.conftest import TEST_USER_IDS # ── Constants ───────────────────────────────────────────────────────────── _USER_ID = TEST_USER_IDS["power"] _DEFAULT_FIXTURE_DIR = Path(__file__).parent / "fixtures" / "journey_v2" # ── Fixture loading ─────────────────────────────────────────────────────── def _fixtures_dir(config) -> Path: override = config.getoption("--journey-dir") return Path(override) if override else _DEFAULT_FIXTURE_DIR def _load_cases(config) -> list[dict]: return yaml.safe_load( (_fixtures_dir(config) / "cases.yaml").read_text(encoding="utf-8") ) def _read_data_file(filename: str, fixtures_dir: Path) -> str: return (fixtures_dir / "data" / filename).read_text(encoding="utf-8") # ── pytest_generate_tests ───────────────────────────────────────────────── def pytest_generate_tests(metafunc): if "journey_case" not in metafunc.fixturenames: return cases = _load_cases(metafunc.config) metafunc.parametrize("journey_case", cases, ids=[c["id"] for c in cases]) # ── Executor builder ────────────────────────────────────────────────────── def _make_fs_executor(directory_files: list[dict], fixtures_dir: Path): """Return an async callback that simulates filesystem tool responses. Matches the signature expected by ``set_client_executor`` / ``execute_on_client``: receives the full ``payload`` dict and returns a result dict. ``directory_files`` is a list of ``{path, content_file}`` dicts; ``content_file`` is relative to ``fixtures_dir/data/``. """ file_map: dict[str, str] = { entry["path"]: _read_data_file(entry["content_file"], fixtures_dir) for entry in directory_files } async def _executor(payload: dict) -> dict: action = payload.get("action", "") data = payload.get("data") or {} if action == "list_directory": return {"entries": [ {"type": "file", "name": p.split("/")[-1], "path": p} for p in file_map ]} if action == "read_file_content": path = data.get("path", "") return {"content": file_map.get(path, "")} if action == "get_file_metadata": path = data.get("path", "") name = path.split("/")[-1] ext = "." + name.rsplit(".", 1)[-1] if "." in name else "" return {"name": name, "extension": ext, "size": 1024, "createdAt": None, "modifiedAt": None} return {} return _executor # ── Journey runner helper ───────────────────────────────────────────────── async def _run_journey(user_id: str, case: dict, executor) -> dict[str, Any]: """Drive start + all user_messages for a case. Returns the final reply dict. Mirrors ``device_ws._handle_journey_start/message``: sets the client executor (so filesystem tools work) before each handler call. """ session_id = str(uuid.uuid4()) try: set_client_executor(executor) reply = await handle_journey_start(user_id, { "agent_type": "local", "directory": case["directory"], "data_types": case["data_types"], "session_id": session_id, }) for msg in case.get("user_messages", []): if reply.get("done"): break set_client_executor(executor) reply = await handle_journey_message(user_id, { "session_id": reply["session_id"], "message": msg, }) finally: clear_client_executor() _sessions.pop(session_id, None) return reply # ── Assertion helper ────────────────────────────────────────────────────── def _evaluate_case(case: dict, reply: dict) -> tuple[float, str]: """Return (score, comment) for a journey case given the final reply dict.""" if case.get("expect_question"): has_q = "?" in reply.get("message", "") return (1.0 if has_q else 0.0), f"first_reply_has_question={has_q}" return 1.0, "no specific assertion" # ── Unit tests ──────────────────────────────────────────────────────────── def test_4_6a_extract_valid_json(): """_extract_agent_config: valid JSON between markers → returns serialised config.""" config = AgentConfig( content_types=[], global_rules=["No project = no entity"], data_types=["tasks"], ) text = f"Some preamble\n{_CONFIG_START}\n{config.model_dump_json()}\n{_CONFIG_END}\nTrailing" result = _extract_agent_config(text) assert result is not None parsed = AgentConfig.model_validate_json(result) assert parsed.global_rules == ["No project = no entity"] def test_4_6b_extract_invalid_json(): """_extract_agent_config: malformed JSON between markers → returns None.""" text = f"{_CONFIG_START}\n{{not: valid json\n{_CONFIG_END}" assert _extract_agent_config(text) is None def test_4_6c_extract_markers_absent(): """_extract_agent_config: no markers at all → returns None.""" assert _extract_agent_config("No markers here at all") is None def test_4_6d_extract_only_start_marker(): """_extract_agent_config: START without END → returns None.""" assert _extract_agent_config(f"text {_CONFIG_START} no end marker") is None @pytest.mark.asyncio async def test_4_6e_session_not_found(): """4.6e Session not found → done=True, agent_config=None, informative message.""" reply = await handle_journey_message(_USER_ID, { "session_id": "nonexistent-session-id", "message": "Hello", }) assert reply["done"] is True assert reply["agent_config"] is None assert "not found" in reply["message"].lower() or "expired" in reply["message"].lower() @pytest.mark.asyncio async def test_4_6f_nudge_uses_new_markers(): """4.6f Nudge injected after max turns uses AGENT_CONFIG markers, not PROMPT_TEMPLATE.""" session_id = str(uuid.uuid4()) captured_histories: list[list[dict]] = [] async def _mock_llm(system_prompt, history, tools, **kwargs) -> str: captured_histories.append(list(history)) # Return plain text — no markers — to trigger the nudge path. return "I still need more information from you." from app.api.routes.agent_setup import JourneySession fake_session = JourneySession( session_id=session_id, user_id=_USER_ID, agent_type="local", directory="/test", data_types=["tasks"], system_prompt="system", langfuse_prompt=None, ) # Fill history to the turn limit so the next message triggers the nudge. for i in range(_MAX_TURNS): fake_session.history.append({"role": "user", "content": f"msg {i}"}) fake_session.history.append({"role": "assistant", "content": "ok"}) _sessions[session_id] = fake_session try: with patch("app.api.routes.agent_setup._call_llm_with_tools", side_effect=_mock_llm): await handle_journey_message(_USER_ID, { "session_id": session_id, "message": "one more message to trigger nudge", }) finally: _sessions.pop(session_id, None) # Second LLM call receives the nudge appended to history. assert len(captured_histories) >= 2, "Expected ≥ 2 LLM calls (main reply + nudge)" nudge_history = captured_histories[1] user_msgs = " ".join(t["content"] for t in nudge_history if t["role"] == "user") assert _CONFIG_START in user_msgs, f"Nudge must reference {_CONFIG_START}" assert _CONFIG_END in user_msgs, f"Nudge must reference {_CONFIG_END}" assert "PROMPT_TEMPLATE" not in user_msgs, "Old PROMPT_TEMPLATE markers must not appear in nudge" # ── Eval tests (real LLM + Langfuse) ───────────────────────────────────── @pytest.mark.asyncio @pytest.mark.eval async def test_eval_journey(journey_case, pytestconfig): """Parametrized eval test — one invocation per YAML case.""" case: dict = journey_case fixtures_dir = _fixtures_dir(pytestconfig) executor = _make_fs_executor(case.get("directory_files", []), fixtures_dir) lf = get_langfuse() obs_ctx = lf.start_as_current_observation( name=f"eval-journey-{case['id']}-{case.get('score_name', 'unknown').replace('.', '-')}", metadata={"step": "4", "case_id": case["id"]}, ) if lf else nullcontext() with obs_ctx as obs: reply = await _run_journey(_USER_ID, case, executor) score, comment = _evaluate_case(case, reply) if obs is not None: obs.score( name=case.get("score_name", f"journey.case_{case['id']}"), value=score, comment=comment, ) if lf: lf.flush() assert score == 1.0, f"[{case['id']}] {case.get('description', '')} — {comment}"