Rename all Pydantic models referring to the scout subsystem: AgentConfig → ScoutConfig, ContentTypeConfig → ScoutContentTypeConfig, AgentCatalogItem → ScoutCatalogItem, AgentCreationCheckRequest/Response → ScoutCreationCheckRequest/Response, AgentTriggerRequest → ScoutTriggerRequest, AgentRunLogResponse → ScoutRunLogResponse. LLM-helper agent schemas in app/agents/* are untouched. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
300 lines
11 KiB
Python
300 lines
11 KiB
Python
"""Tests for Local Agent V2 journey setup (Step 4).
|
||
|
||
Covers the chatbot journey that produces a structured ScoutConfig JSON
instead of a freeform prompt_template string.

Unit tests (no LLM)
--------------------
  4.6a  _extract_agent_config: valid JSON → returns serialised config
  4.6b  _extract_agent_config: invalid JSON → returns None
  4.6c  _extract_agent_config: markers absent → returns None
  4.6d  _extract_agent_config: only START marker → returns None
  4.6e  Session not found → done=True, agent_config=None
  4.6f  Nudge uses AGENT_CONFIG_START/END markers (not old PROMPT_TEMPLATE)

Eval test (real LLM + Langfuse scoring)
----------------------------------------
  4.1   Journey start explores directory → first reply contains a question

Cases 4.2–4.5 (multi-turn conversations producing a full ScoutConfig) are
non-deterministic and tested manually — results tracked in Langfuse.

Run:
    pytest tests/test_journey_v2.py -v
    pytest tests/test_journey_v2.py -v -k "4_6"          # unit only
    pytest tests/test_journey_v2.py -v -k "eval"         # single LLM eval
    pytest tests/test_journey_v2.py -v --journey-dir /p  # custom fixtures
|
||
"""
|
||
|
||
from __future__ import annotations

import uuid
from contextlib import nullcontext
from pathlib import Path
from typing import Any
from unittest.mock import patch

import pytest
import yaml

from app.api.routes.scout_setup import (
    _CONFIG_END,
    _CONFIG_START,
    _MAX_TURNS,
    JourneySession,
    _extract_agent_config,
    _sessions,
    handle_journey_message,
    handle_journey_start,
)
from app.core.langfuse_client import get_langfuse
from app.core.ws_context import clear_client_executor, set_client_executor
from app.schemas import ScoutConfig
from tests.conftest import TEST_USER_IDS
|
||
|
||
# ── Constants ─────────────────────────────────────────────────────────────

# User id passed to every journey handler call in this module.
_USER_ID = TEST_USER_IDS["power"]

# Fixture root used when no --journey-dir override is given; resolved
# relative to this test file so the suite does not depend on the CWD.
_DEFAULT_FIXTURE_DIR = Path(__file__).parent / "fixtures" / "journey_v2"
|
||
|
||
# ── Fixture loading ───────────────────────────────────────────────────────
|
||
|
||
|
||
def _fixtures_dir(config) -> Path:
    """Return the directory holding ``cases.yaml`` and the ``data/`` files.

    Args:
        config: The pytest config object; only ``getoption`` is used.

    Returns:
        The ``--journey-dir`` value as a ``Path`` when one was supplied,
        otherwise ``_DEFAULT_FIXTURE_DIR``.
    """
    # ``default=None`` keeps collection working even when the option has
    # not been registered (getoption would otherwise raise ValueError).
    override = config.getoption("--journey-dir", default=None)
    if override:
        return Path(override)
    return _DEFAULT_FIXTURE_DIR
|
||
|
||
|
||
def _load_cases(config) -> list[dict]:
    """Load the journey cases from ``cases.yaml`` in the fixtures directory.

    Args:
        config: The pytest config object, forwarded to ``_fixtures_dir``.

    Returns:
        The parsed YAML document; an empty list when the file is empty.
    """
    cases_path = _fixtures_dir(config) / "cases.yaml"
    loaded = yaml.safe_load(cases_path.read_text(encoding="utf-8"))
    # safe_load yields None for an empty document; callers iterate the
    # result, so normalise that to "no cases".
    return loaded or []
|
||
|
||
|
||
def _read_data_file(filename: str, fixtures_dir: Path) -> str:
    """Return the UTF-8 text of ``<fixtures_dir>/data/<filename>``."""
    data_path = fixtures_dir / "data" / filename
    return data_path.read_text(encoding="utf-8")
|
||
|
||
|
||
# ── pytest_generate_tests ─────────────────────────────────────────────────
|
||
|
||
|
||
def pytest_generate_tests(metafunc):
    """Parametrize ``journey_case`` with one entry per case in ``cases.yaml``.

    Tests that do not request the ``journey_case`` fixture are left alone.
    """
    if "journey_case" not in metafunc.fixturenames:
        return
    cases = _load_cases(metafunc.config)
    # YAML may parse an id such as 4.1 as a float; coerce so every test id
    # is a plain string.
    case_ids = [str(case["id"]) for case in cases]
    metafunc.parametrize("journey_case", cases, ids=case_ids)
|
||
|
||
|
||
# ── Executor builder ──────────────────────────────────────────────────────
|
||
|
||
|
||
def _make_fs_executor(directory_files: list[dict], fixtures_dir: Path):
    """Return an async callback that simulates filesystem tool responses.

    Matches the signature expected by ``set_client_executor`` / ``execute_on_client``:
    receives the full ``payload`` dict and returns a result dict.

    ``directory_files`` is a list of ``{path, content_file}`` dicts;
    ``content_file`` is relative to ``fixtures_dir/data/``.

    Supported ``payload["action"]`` values:
      * ``list_directory``    → every known file, regardless of the
        directory asked for.
      * ``read_file_content`` → the fixture text, or ``""`` for an
        unknown path.
      * ``get_file_metadata`` → name/extension derived from the path, with
        a fixed size of 1024 and no timestamps.
    Any other action yields an empty dict.
    """
    # All fixture contents are read eagerly, so a missing data file fails
    # here rather than in the middle of a journey.
    file_map: dict[str, str] = {
        entry["path"]: _read_data_file(entry["content_file"], fixtures_dir)
        for entry in directory_files
    }

    def _basename(path: str) -> str:
        # Paths in the fixtures use "/" separators.
        return path.split("/")[-1]

    async def _executor(payload: dict) -> dict:
        action = payload.get("action", "")
        data = payload.get("data") or {}

        if action == "list_directory":
            return {
                "entries": [
                    {"type": "file", "name": _basename(p), "path": p}
                    for p in file_map
                ]
            }

        if action == "read_file_content":
            return {"content": file_map.get(data.get("path", ""), "")}

        if action == "get_file_metadata":
            name = _basename(data.get("path", ""))
            ext = "." + name.rsplit(".", 1)[-1] if "." in name else ""
            return {
                "name": name,
                "extension": ext,
                "size": 1024,
                "createdAt": None,
                "modifiedAt": None,
            }

        return {}

    return _executor
|
||
|
||
|
||
# ── Journey runner helper ─────────────────────────────────────────────────
|
||
|
||
|
||
async def _run_journey(user_id: str, case: dict, executor) -> dict[str, Any]:
    """Drive start + all user_messages for a case. Returns the final reply dict.

    Mirrors ``device_ws._handle_journey_start/message``: sets the client
    executor (so filesystem tools work) before each handler call.

    Args:
        user_id: Id passed to both journey handlers.
        case: One YAML case; ``directory`` and ``data_types`` are required,
            ``user_messages`` is optional.
        executor: Callback installed via ``set_client_executor``.

    Returns:
        The reply from the last handler that was called. Messages after a
        reply with a truthy ``done`` are not sent.
    """
    session_id = str(uuid.uuid4())
    # Follow-up messages use whatever id the handler echoes back, so track
    # every id seen and clean all of them up afterwards.
    seen_session_ids = {session_id}
    try:
        set_client_executor(executor)
        reply = await handle_journey_start(user_id, {
            "agent_type": "local",
            "directory": case["directory"],
            "data_types": case["data_types"],
            "session_id": session_id,
        })

        # ``or []`` also covers a YAML ``user_messages:`` key left empty.
        for msg in case.get("user_messages") or []:
            if reply.get("done"):
                break
            active_id = reply.get("session_id") or session_id
            seen_session_ids.add(active_id)
            set_client_executor(executor)
            reply = await handle_journey_message(user_id, {
                "session_id": active_id,
                "message": msg,
            })
    finally:
        clear_client_executor()
        for sid in seen_session_ids:
            _sessions.pop(sid, None)

    return reply
|
||
|
||
|
||
# ── Assertion helper ──────────────────────────────────────────────────────
|
||
|
||
|
||
def _evaluate_case(case: dict, reply: dict) -> tuple[float, str]:
    """Return (score, comment) for a journey case given the final reply dict.

    When the case sets ``expect_question``, the score is 1.0 if the reply's
    ``message`` contains a ``?`` and 0.0 otherwise. Cases without that flag
    always score 1.0.
    """
    if case.get("expect_question"):
        # A reply may carry ``message: None``; treat that like no message
        # instead of failing on the membership test.
        message = reply.get("message") or ""
        has_q = "?" in message
        return (1.0 if has_q else 0.0), f"first_reply_has_question={has_q}"

    return 1.0, "no specific assertion"
|
||
|
||
|
||
# ── Unit tests ────────────────────────────────────────────────────────────
|
||
|
||
|
||
def test_4_6a_extract_valid_json():
    """_extract_agent_config: valid JSON between markers → returns serialised config."""
    config = ScoutConfig(
        content_types=[],
        global_rules=["No project = no entity"],
        data_types=["tasks"],
    )
    # Surround the payload with extra text to check that only the part
    # between the markers is parsed.
    text = f"Some preamble\n{_CONFIG_START}\n{config.model_dump_json()}\n{_CONFIG_END}\nTrailing"

    result = _extract_agent_config(text)

    assert result is not None
    parsed = ScoutConfig.model_validate_json(result)
    assert parsed.global_rules == ["No project = no entity"]
|
||
|
||
|
||
def test_4_6b_extract_invalid_json():
    """_extract_agent_config: malformed JSON between markers → returns None."""
    # ``{{`` is an escaped brace: the payload is the literal "{not: valid json".
    text = f"{_CONFIG_START}\n{{not: valid json\n{_CONFIG_END}"
    assert _extract_agent_config(text) is None
|
||
|
||
|
||
def test_4_6c_extract_markers_absent():
    """_extract_agent_config: no markers at all → returns None."""
    text = "No markers here at all"
    assert _extract_agent_config(text) is None
|
||
|
||
|
||
def test_4_6d_extract_only_start_marker():
    """_extract_agent_config: START without END → returns None."""
    text = f"text {_CONFIG_START} no end marker"
    assert _extract_agent_config(text) is None
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_4_6e_session_not_found():
    """4.6e Session not found → done=True, agent_config=None, informative message."""
    reply = await handle_journey_message(_USER_ID, {
        "session_id": "nonexistent-session-id",
        "message": "Hello",
    })

    assert reply["done"] is True
    assert reply["agent_config"] is None
    # Either wording is accepted for the user-facing explanation.
    message = reply["message"].lower()
    assert "not found" in message or "expired" in message
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_4_6f_nudge_uses_new_markers():
    """4.6f Nudge injected after max turns uses AGENT_CONFIG markers, not PROMPT_TEMPLATE."""
    session_id = str(uuid.uuid4())
    captured_histories: list[list[dict]] = []

    async def _mock_llm(system_prompt, history, tools, **kwargs) -> str:
        # Snapshot the list: the handler may keep appending to the same
        # history object between calls.
        captured_histories.append(list(history))
        # Return plain text — no markers — to trigger the nudge path.
        return "I still need more information from you."

    fake_session = JourneySession(
        session_id=session_id,
        user_id=_USER_ID,
        agent_type="local",
        directory="/test",
        data_types=["tasks"],
        system_prompt="system",
        langfuse_prompt=None,
    )
    # Fill history to the turn limit so the next message triggers the nudge.
    for i in range(_MAX_TURNS):
        fake_session.history.extend([
            {"role": "user", "content": f"msg {i}"},
            {"role": "assistant", "content": "ok"},
        ])
    _sessions[session_id] = fake_session

    try:
        with patch("app.api.routes.scout_setup._call_llm_with_tools", side_effect=_mock_llm):
            await handle_journey_message(_USER_ID, {
                "session_id": session_id,
                "message": "one more message to trigger nudge",
            })
    finally:
        _sessions.pop(session_id, None)

    # Second LLM call receives the nudge appended to history.
    assert len(captured_histories) >= 2, "Expected ≥ 2 LLM calls (main reply + nudge)"
    nudge_history = captured_histories[1]
    user_msgs = " ".join(t["content"] for t in nudge_history if t["role"] == "user")
    assert _CONFIG_START in user_msgs, f"Nudge must reference {_CONFIG_START}"
    assert _CONFIG_END in user_msgs, f"Nudge must reference {_CONFIG_END}"
    assert "PROMPT_TEMPLATE" not in user_msgs, "Old PROMPT_TEMPLATE markers must not appear in nudge"
|
||
|
||
|
||
# ── Eval tests (real LLM + Langfuse) ─────────────────────────────────────
|
||
|
||
|
||
@pytest.mark.asyncio
@pytest.mark.eval
async def test_eval_journey(journey_case, pytestconfig):
    """Parametrized eval test — one invocation per YAML case.

    Runs the journey against the simulated filesystem, scores the final
    reply with ``_evaluate_case`` and, when a Langfuse client is available,
    records that score on an observation named after the case.
    """
    case: dict = journey_case
    case_id = case["id"]
    fixtures_dir = _fixtures_dir(pytestconfig)
    # ``or []`` also covers a YAML ``directory_files:`` key left empty.
    executor = _make_fs_executor(case.get("directory_files") or [], fixtures_dir)

    lf = get_langfuse()
    if lf:
        name_suffix = case.get("score_name", "unknown").replace(".", "-")
        obs_ctx = lf.start_as_current_observation(
            name=f"eval-journey-{case_id}-{name_suffix}",
            metadata={"step": "4", "case_id": case_id},
        )
    else:
        # No Langfuse client: ``obs`` below is None and scoring is skipped.
        obs_ctx = nullcontext()

    try:
        with obs_ctx as obs:
            reply = await _run_journey(_USER_ID, case, executor)
            score, comment = _evaluate_case(case, reply)

            if obs is not None:
                obs.score(
                    name=case.get("score_name", f"journey.case_{case_id}"),
                    value=score,
                    comment=comment,
                )
    finally:
        # Flush even when the journey raised, so the observation of a
        # failed run is not lost.
        if lf:
            lf.flush()

    assert score == 1.0, f"[{case_id}] {case.get('description', '')} — {comment}"
|