Replace freeform prompt_template output with validated AgentConfig JSON: - agent_setup.py: new system prompt (journey_system_v2), AGENT_CONFIG_START/END markers, _extract_agent_config() with Pydantic validation, updated handlers returning agent_config key; import AgentConfig from schemas - tests/test_journey_v2.py: 6 unit tests + 5 parametrized LLM eval cases following test_agent_runner_v2.py pattern; _run_journey uses set_client_executor/clear_client_executor mirroring device_ws - tests/fixtures/journey_v2/: cases.yaml + email_action.html + email_info.html - tests/conftest.py: add --journey-dir CLI option; remove S3/plugin fixtures (cleanup from microservices migration, already present in working tree) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
350 lines
13 KiB
Python
350 lines
13 KiB
Python
"""Tests for Local Agent V2 journey setup (Step 4).

Covers the chatbot journey that produces a structured AgentConfig JSON
instead of a freeform prompt_template string.

Unit tests (no LLM)
--------------------
4.6a  _extract_agent_config: valid JSON → returns serialised config
4.6b  _extract_agent_config: invalid JSON → returns None
4.6c  _extract_agent_config: markers absent → returns None
4.6d  _extract_agent_config: only START marker → returns None
4.6e  Session not found → done=True, agent_config=None
4.6f  Nudge uses AGENT_CONFIG_START/END markers (not old PROMPT_TEMPLATE)

Eval tests (real LLM + Langfuse scoring)
-----------------------------------------
Cases are defined in tests/fixtures/journey_v2/cases.yaml.
Email HTML files live in tests/fixtures/journey_v2/data/.
Use --journey-dir to point at a custom folder (same structure required).

Run:
    pytest tests/test_journey_v2.py -v
    pytest tests/test_journey_v2.py -v -k "4_6"           # unit only
    pytest tests/test_journey_v2.py -v -k "eval"          # LLM evals only
    pytest tests/test_journey_v2.py -v --journey-dir /p   # custom fixtures
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from contextlib import nullcontext
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from app.api.routes.agent_setup import (
|
|
_CONFIG_END,
|
|
_CONFIG_START,
|
|
_MAX_TURNS,
|
|
_extract_agent_config,
|
|
_sessions,
|
|
handle_journey_message,
|
|
handle_journey_start,
|
|
)
|
|
from app.core.langfuse_client import get_langfuse
|
|
from app.core.ws_context import clear_client_executor, set_client_executor
|
|
from app.schemas import AgentConfig
|
|
from tests.conftest import TEST_USER_IDS
|
|
|
|
# ── Constants ─────────────────────────────────────────────────────────────
|
|
|
|
# Identity used for every handler call in this module: the "power" entry
# of the TEST_USER_IDS mapping imported from tests.conftest.
_USER_ID = TEST_USER_IDS["power"]

# Fixture folder that sits next to this test file; it is the fallback used
# when the --journey-dir option is not supplied.
_DEFAULT_FIXTURE_DIR = Path(__file__).parent.joinpath("fixtures", "journey_v2")
|
|
|
|
# ── Fixture loading ───────────────────────────────────────────────────────
|
|
|
|
|
|
def _fixtures_dir(config) -> Path:
|
|
override = config.getoption("--journey-dir")
|
|
return Path(override) if override else _DEFAULT_FIXTURE_DIR
|
|
|
|
|
|
def _load_cases(config) -> list[dict]:
    """Load the eval case definitions from ``cases.yaml``.

    The file is looked up in the folder returned by ``_fixtures_dir`` and
    read as UTF-8.

    Returns:
        The list of case dicts, or an empty list when the YAML document is
        empty.

    Raises:
        TypeError: if the document's top level is not a list.
    """
    cases_path = _fixtures_dir(config) / "cases.yaml"
    loaded = yaml.safe_load(cases_path.read_text(encoding="utf-8"))

    # safe_load yields None for an empty document; treating that as "no
    # cases" avoids an opaque TypeError while pytest is still collecting.
    if loaded is None:
        return []
    if not isinstance(loaded, list):
        raise TypeError(
            f"{cases_path} must contain a top-level list of cases, "
            f"got {type(loaded).__name__}"
        )
    return loaded
|
|
|
|
|
|
def _read_data_file(filename: str, fixtures_dir: Path) -> str:
|
|
return (fixtures_dir / "data" / filename).read_text(encoding="utf-8")
|
|
|
|
|
|
# ── pytest_generate_tests ─────────────────────────────────────────────────
|
|
|
|
|
|
def pytest_generate_tests(metafunc):
    """Parametrize tests that request the ``journey_case`` fixture.

    Every entry loaded from the cases file becomes one invocation, and the
    entry's ``id`` value is used as the test id. Tests that do not request
    ``journey_case`` are left untouched.
    """
    if "journey_case" in metafunc.fixturenames:
        journey_cases = _load_cases(metafunc.config)
        case_ids = [entry["id"] for entry in journey_cases]
        metafunc.parametrize("journey_case", journey_cases, ids=case_ids)
|
|
|
|
|
|
# ── Executor builder ──────────────────────────────────────────────────────
|
|
|
|
|
|
def _make_fs_executor(directory_files: list[dict], fixtures_dir: Path):
|
|
"""Return an async callback that simulates filesystem tool responses.
|
|
|
|
Matches the signature expected by ``set_client_executor`` / ``execute_on_client``:
|
|
receives the full ``payload`` dict and returns a result dict.
|
|
|
|
``directory_files`` is a list of ``{path, content_file}`` dicts;
|
|
``content_file`` is relative to ``fixtures_dir/data/``.
|
|
"""
|
|
file_map: dict[str, str] = {
|
|
entry["path"]: _read_data_file(entry["content_file"], fixtures_dir)
|
|
for entry in directory_files
|
|
}
|
|
|
|
async def _executor(payload: dict) -> dict:
|
|
action = payload.get("action", "")
|
|
data = payload.get("data") or {}
|
|
|
|
if action == "list_directory":
|
|
return {"entries": [
|
|
{"type": "file", "name": p.split("/")[-1], "path": p}
|
|
for p in file_map
|
|
]}
|
|
|
|
if action == "read_file_content":
|
|
path = data.get("path", "")
|
|
return {"content": file_map.get(path, "")}
|
|
|
|
if action == "get_file_metadata":
|
|
path = data.get("path", "")
|
|
name = path.split("/")[-1]
|
|
ext = "." + name.rsplit(".", 1)[-1] if "." in name else ""
|
|
return {"name": name, "extension": ext, "size": 1024,
|
|
"createdAt": None, "modifiedAt": None}
|
|
|
|
return {}
|
|
|
|
return _executor
|
|
|
|
|
|
# ── Journey runner helper ─────────────────────────────────────────────────
|
|
|
|
|
|
async def _run_journey(user_id: str, case: dict, executor) -> dict[str, Any]:
    """Play one case against the journey handlers and return the last reply.

    The journey is started with the case's ``directory`` and ``data_types``;
    then each entry of ``user_messages`` is sent in order until a reply
    reports ``done``. As in ``device_ws._handle_journey_start/message``, the
    client executor is installed ahead of every handler call so that the
    filesystem tools work.

    Whether or not a handler raises, the client executor is cleared and the
    session created here is dropped from ``_sessions`` afterwards.
    """
    session_id = str(uuid.uuid4())
    try:
        set_client_executor(executor)
        start_payload = {
            "agent_type": "local",
            "directory": case["directory"],
            "data_types": case["data_types"],
            "session_id": session_id,
        }
        reply = await handle_journey_start(user_id, start_payload)

        for user_text in case.get("user_messages", []):
            # Stop feeding messages once the journey has finished.
            if reply.get("done"):
                break
            set_client_executor(executor)
            follow_up = {"session_id": reply["session_id"], "message": user_text}
            reply = await handle_journey_message(user_id, follow_up)
    finally:
        clear_client_executor()
        _sessions.pop(session_id, None)

    return reply
|
|
|
|
|
|
# ── Assertion helper ──────────────────────────────────────────────────────
|
|
|
|
|
|
def _evaluate_case(case: dict, reply: dict) -> tuple[float, str]:
|
|
"""Return (score, comment) for a journey case given the final reply dict."""
|
|
if case.get("expect_question"):
|
|
has_q = "?" in reply.get("message", "")
|
|
return (1.0 if has_q else 0.0), f"first_reply_has_question={has_q}"
|
|
|
|
if case.get("expect_done") and not reply.get("done"):
|
|
return 0.0, "expected done=True but journey did not complete"
|
|
|
|
agent_config_raw = reply.get("agent_config")
|
|
|
|
if case.get("expect_valid_config"):
|
|
if not agent_config_raw:
|
|
return 0.0, "agent_config is None"
|
|
try:
|
|
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
|
valid = len(parsed.content_types) > 0
|
|
return (1.0 if valid else 0.0), f"content_types={len(parsed.content_types)}"
|
|
except Exception as exc:
|
|
return 0.0, f"parse error: {exc}"
|
|
|
|
if case.get("expect_content_type_id"):
|
|
expected_id = case["expect_content_type_id"]
|
|
if not agent_config_raw:
|
|
return 0.0, "agent_config is None"
|
|
try:
|
|
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
|
ids = [ct.id for ct in parsed.content_types]
|
|
found = expected_id in ids
|
|
return (1.0 if found else 0.0), f"content_type_ids={ids}, expected={expected_id}"
|
|
except Exception as exc:
|
|
return 0.0, f"parse error: {exc}"
|
|
|
|
if case.get("expect_extraction_contains"):
|
|
keyword = case["expect_extraction_contains"].lower()
|
|
if not agent_config_raw:
|
|
return 0.0, "agent_config is None"
|
|
try:
|
|
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
|
if not parsed.content_types:
|
|
return 0.0, "no content_types in config"
|
|
prompt = parsed.content_types[0].extraction_prompt.lower()
|
|
found = keyword in prompt
|
|
return (1.0 if found else 0.0), f"keyword='{keyword}' in extraction_prompt={found}"
|
|
except Exception as exc:
|
|
return 0.0, f"parse error: {exc}"
|
|
|
|
if case.get("expect_global_rules"):
|
|
if not agent_config_raw:
|
|
return 0.0, "agent_config is None"
|
|
try:
|
|
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
|
has_rules = len(parsed.global_rules) > 0
|
|
return (1.0 if has_rules else 0.0), f"global_rules={parsed.global_rules}"
|
|
except Exception as exc:
|
|
return 0.0, f"parse error: {exc}"
|
|
|
|
return 1.0, "no specific assertion"
|
|
|
|
|
|
# ── Unit tests ────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_4_6a_extract_valid_json():
    """_extract_agent_config: a valid config wrapped in the markers round-trips."""
    source_config = AgentConfig(
        content_types=[],
        global_rules=["No project = no entity"],
        data_types=["tasks"],
    )
    # Surround the payload with unrelated text on both sides of the markers.
    wrapped = "\n".join(
        [
            "Some preamble",
            _CONFIG_START,
            source_config.model_dump_json(),
            _CONFIG_END,
            "Trailing",
        ]
    )

    extracted = _extract_agent_config(wrapped)

    assert extracted is not None
    round_tripped = AgentConfig.model_validate_json(extracted)
    assert round_tripped.global_rules == ["No project = no entity"]
|
|
|
|
|
|
def test_4_6b_extract_invalid_json():
    """_extract_agent_config: broken JSON between the markers yields None."""
    broken_payload = _CONFIG_START + "\n{not: valid json\n" + _CONFIG_END
    extracted = _extract_agent_config(broken_payload)
    assert extracted is None
|
|
|
|
|
|
def test_4_6c_extract_markers_absent():
    """_extract_agent_config: text without any marker yields None."""
    extracted = _extract_agent_config("No markers here at all")
    assert extracted is None
|
|
|
|
|
|
def test_4_6d_extract_only_start_marker():
    """_extract_agent_config: a START marker with no matching END yields None."""
    unterminated = "text " + _CONFIG_START + " no end marker"
    extracted = _extract_agent_config(unterminated)
    assert extracted is None
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_4_6e_session_not_found():
    """4.6e Unknown session id → done=True, no agent_config, explanatory message."""
    payload = {"session_id": "nonexistent-session-id", "message": "Hello"}

    reply = await handle_journey_message(_USER_ID, payload)

    assert reply["done"] is True
    assert reply["agent_config"] is None
    # Either wording is accepted for the explanation.
    lowered = reply["message"].lower()
    assert any(hint in lowered for hint in ("not found", "expired"))
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_4_6f_nudge_uses_new_markers():
    """4.6f The nudge sent after max turns names the AGENT_CONFIG markers only."""
    from app.api.routes.agent_setup import JourneySession

    session_id = str(uuid.uuid4())
    seen_histories: list[list[dict]] = []

    async def _fake_llm(system_prompt, history, tools, **kwargs) -> str:
        # Keep a snapshot, since the caller may keep mutating its history list.
        seen_histories.append(list(history))
        # Marker-free plain text is what pushes the handler onto the nudge path.
        return "I still need more information from you."

    session = JourneySession(
        session_id=session_id,
        user_id=_USER_ID,
        agent_type="local",
        directory="/test",
        data_types=["tasks"],
        system_prompt="system",
        langfuse_prompt=None,
    )
    # Pre-load _MAX_TURNS user/assistant exchanges so that the next message
    # is the one that triggers the nudge.
    for turn in range(_MAX_TURNS):
        session.history.extend(
            [
                {"role": "user", "content": f"msg {turn}"},
                {"role": "assistant", "content": "ok"},
            ]
        )
    _sessions[session_id] = session

    try:
        with patch("app.api.routes.agent_setup._call_llm_with_tools", side_effect=_fake_llm):
            await handle_journey_message(
                _USER_ID,
                {"session_id": session_id, "message": "one more message to trigger nudge"},
            )
    finally:
        _sessions.pop(session_id, None)

    # The nudge is expected in the history handed to the second LLM call.
    assert len(seen_histories) >= 2, "Expected ≥ 2 LLM calls (main reply + nudge)"
    user_contents = [turn["content"] for turn in seen_histories[1] if turn["role"] == "user"]
    user_msgs = " ".join(user_contents)
    assert _CONFIG_START in user_msgs, f"Nudge must reference {_CONFIG_START}"
    assert _CONFIG_END in user_msgs, f"Nudge must reference {_CONFIG_END}"
    assert "PROMPT_TEMPLATE" not in user_msgs, "Old PROMPT_TEMPLATE markers must not appear in nudge"
|
|
|
|
|
|
# ── Eval tests (real LLM + Langfuse) ─────────────────────────────────────
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.eval
async def test_eval_journey(journey_case, pytestconfig):
    """Parametrized eval test — one invocation per YAML case.

    Runs the whole journey for ``journey_case`` against a simulated
    filesystem executor, scores the final reply with ``_evaluate_case`` and
    asserts that the score is 1.0. When ``get_langfuse()`` returns a client,
    the run is wrapped in an observation and the score is attached to it;
    otherwise the test runs without any Langfuse reporting.
    """
    case: dict = journey_case
    fixtures_dir = _fixtures_dir(pytestconfig)
    executor = _make_fs_executor(case.get("directory_files", []), fixtures_dir)

    lf = get_langfuse()
    obs_ctx = lf.start_as_current_observation(
        name=f"eval-journey-{case['id']}-{case.get('score_name', 'unknown').replace('.', '-')}",
        metadata={"step": "4", "case_id": case["id"]},
    ) if lf else nullcontext()

    try:
        with obs_ctx as obs:
            reply = await _run_journey(_USER_ID, case, executor)
            score, comment = _evaluate_case(case, reply)

            # nullcontext() yields None, so there is nothing to score on then.
            if obs is not None:
                obs.score(
                    name=case.get("score_name", f"journey.case_{case['id']}"),
                    value=score,
                    comment=comment,
                )
    finally:
        # Flush in ``finally`` so the observation of a run that raised is
        # still flushed instead of being skipped together with the assert.
        if lf:
            lf.flush()

    assert score == 1.0, f"[{case['id']}] {case.get('description', '')} — {comment}"
|