431 lines
15 KiB
Python
431 lines
15 KiB
Python
"""Tests for Local Agent V2 runner (Step 2).

Covers the unified per-file flow:
  Phase A — detect + preprocess (Python, zero LLM)
  Phase B — single LLM call with tools (classify + extract + create)

Fixture-based eval tests (2.1–2.7)
-----------------------------------
Cases are defined in tests/fixtures/agent_runner_v2/cases.yaml.
Email HTML files live in tests/fixtures/agent_runner_v2/data/.
Use --runner-dir to point at a custom folder (same structure required).

Unit tests (no LLM)
--------------------
  2.8   items_created count → items_created == N create_* calls
  2.9   Device offline → status=error
  2.10  Empty file → items_processed=0, status=success

Run:
  pytest tests/test_agent_runner_v2.py -v
  pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8"  # unit only
  pytest tests/test_agent_runner_v2.py -v -k "eval"  # LLM evals only
  pytest tests/test_agent_runner_v2.py -v --runner-dir /path/to/dir  # custom fixtures
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import uuid
|
||
from contextlib import nullcontext
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Any
|
||
from unittest.mock import AsyncMock, MagicMock, patch
|
||
|
||
import pytest
|
||
import yaml
|
||
|
||
from app.core.scout_runner import (
|
||
_format_metadata,
|
||
_format_projects,
|
||
_get_extraction_rules,
|
||
_get_no_match_behavior,
|
||
run_local_agent,
|
||
)
|
||
from app.core.device_manager import DeviceConnectionManager
|
||
from app.core.langfuse_client import get_langfuse
|
||
from app.models import ScoutRunLog, LocalScoutConfig
|
||
from tests.conftest import TEST_USER_IDS
|
||
|
||
# ── Constants ─────────────────────────────────────────────────────────────
|
||
|
||
# User id passed to run_local_agent and stamped on every config / run log
# built by the helpers in this module.
_USER_ID = TEST_USER_IDS["power"]

# Fixture root used by _fixtures_dir when --runner-dir is not supplied.
_DEFAULT_FIXTURE_DIR = Path(__file__).parent.joinpath("fixtures", "agent_runner_v2")

# Default scout_config handed to LocalScoutConfig by _make_config.
_AGENT_CONFIG = {
    "content_types": [
        {
            "id": "email_html",
            "label": "Email HTML",
            "detection_hint": "HTML file with From/To/Subject headers",
            "preprocessing": "email_html",
            # Three adjacent literals are concatenated into one prompt string.
            "extraction_prompt": (
                "If the email contains a direct action request or task assignment → create a task. "
                "If the email contains informational content, updates, or FYI → create a note. "
                "If the email mentions a specific date for a meeting or deadline → create a timeline entry."
            ),
        }
    ],
    "global_rules": [
        # Italian, kept verbatim because it is runtime data. English gloss:
        # "If the file cannot be traced back to any project, do not create any entity."
        "Se il file non è riconducibile a nessun progetto, non creare alcuna entità."
    ],
    "data_types": ["tasks", "notes", "timelines"],
}

# Canonical project definitions, referenced symbolically in cases.yaml.
_PROJECTS: dict[str, dict] = {
    "alpha": {
        "id": "proj-alpha",
        "name": "Project Alpha",
        "status": "active",
    },
    "beta": {
        "id": "proj-beta",
        "name": "Project Beta",
        "status": "active",
    },
}
|
||
|
||
|
||
# ── Fixture loading ───────────────────────────────────────────────────────
|
||
|
||
|
||
def _fixtures_dir(config) -> Path:
|
||
override = config.getoption("--runner-dir")
|
||
return Path(override) if override else _DEFAULT_FIXTURE_DIR
|
||
|
||
|
||
def _load_cases(config) -> list[dict]:
    """Load the eval cases from ``cases.yaml`` in the active fixture directory.

    Args:
        config: pytest config object, forwarded to ``_fixtures_dir``.

    Returns:
        The list parsed from the YAML document. An empty document yields an
        empty list rather than ``None``.

    Raises:
        TypeError: if the top level of the YAML document is not a list.
    """
    cases_path = _fixtures_dir(config) / "cases.yaml"
    loaded = yaml.safe_load(cases_path.read_text(encoding="utf-8"))
    if loaded is None:
        # safe_load returns None for an empty document; callers iterate the
        # result, so hand back an empty list instead.
        return []
    if not isinstance(loaded, list):
        raise TypeError(
            f"{cases_path} must contain a top-level list of cases, "
            f"got {type(loaded).__name__}"
        )
    return loaded
|
||
|
||
|
||
def _read_case_file(case: dict, data_dir: Path) -> str:
|
||
return (data_dir / case["file"]).read_text(encoding="utf-8")
|
||
|
||
|
||
def _resolve_projects(entries: list[str | dict]) -> list[dict]:
|
||
"""Resolve project list from YAML: symbolic names and/or inline dicts."""
|
||
result = []
|
||
for entry in entries:
|
||
if isinstance(entry, str):
|
||
if entry in _PROJECTS:
|
||
result.append(_PROJECTS[entry])
|
||
elif isinstance(entry, dict):
|
||
result.append(entry)
|
||
return result
|
||
|
||
|
||
# ── pytest_generate_tests — parametrize eval tests from YAML ─────────────
|
||
|
||
|
||
def pytest_generate_tests(metafunc):
    """Parametrize tests that request ``runner_case`` with the cases from YAML.

    Tests that do not use the ``runner_case`` fixture are left untouched.
    Each case's ``id`` value becomes the pytest parameter id.
    """
    if "runner_case" in metafunc.fixturenames:
        yaml_cases = _load_cases(metafunc.config)
        case_ids = [case["id"] for case in yaml_cases]
        metafunc.parametrize("runner_case", yaml_cases, ids=case_ids)
|
||
|
||
|
||
# ── Test helpers ──────────────────────────────────────────────────────────
|
||
|
||
|
||
def _make_config(
    agent_config: dict | None = None,
    directory: str = "/emails",
    device_id: str = "dev-001",
) -> LocalScoutConfig:
    """Build an enabled ``LocalScoutConfig`` for ``_USER_ID`` with a fresh id.

    Args:
        agent_config: value for ``scout_config``. Only ``None`` selects the
            module default ``_AGENT_CONFIG``; an explicit empty dict is passed
            through unchanged.
        directory: the single entry placed in ``directory_paths``.
        device_id: device the config is bound to.

    Returns:
        A config with a random UUID ``id`` and ``last_run_at=None``.
    """
    # Use an identity check rather than truthiness so a caller can test an
    # empty configuration; ``agent_config or _AGENT_CONFIG`` would replace it.
    scout_config = agent_config if agent_config is not None else _AGENT_CONFIG
    return LocalScoutConfig(
        id=str(uuid.uuid4()),
        user_id=_USER_ID,
        device_id=device_id,
        name="Test V2 Agent",
        directory_paths=[directory],
        data_types=["tasks", "notes", "timelines"],
        prompt_template="",
        scout_config=scout_config,
        file_extensions=[".html", ".eml"],
        schedule_cron="0 */6 * * *",
        enabled=True,
        last_run_at=None,
    )
|
||
|
||
|
||
def _make_run_log(agent_id: str) -> ScoutRunLog:
    """Build a ``ScoutRunLog`` in the "running" state for the given agent.

    The log gets a random UUID id, ``scout_type="local"``, ``_USER_ID`` as
    owner, and the current UTC time as ``started_at``.
    """
    run_id = str(uuid.uuid4())
    started = datetime.now(timezone.utc)
    return ScoutRunLog(
        id=run_id,
        scout_id=agent_id,
        scout_type="local",
        user_id=_USER_ID,
        status="running",
        started_at=started,
    )
|
||
|
||
|
||
def _make_manager(online: bool = True, device_id: str = "dev-001") -> DeviceConnectionManager:
    """Return a ``DeviceConnectionManager``, optionally with one fake device registered.

    Args:
        online: when true, register a mock websocket for ``_USER_ID`` and
            *device_id*; when false, return the manager with nothing registered.
        device_id: identifier to register. The default matches the default
            ``device_id`` of ``_make_config``, so existing callers are unaffected.

    Returns:
        The manager instance.
    """
    mgr = DeviceConnectionManager()
    if online:
        ws = MagicMock()
        # send_text is replaced with an AsyncMock so it can be awaited.
        ws.send_text = AsyncMock()
        mgr.register(_USER_ID, device_id, ws)
    return mgr
|
||
|
||
|
||
def _make_executor(
|
||
file_path: str,
|
||
file_content: str,
|
||
projects: list[dict] | None = None,
|
||
existing_tasks: list[dict] | None = None,
|
||
existing_notes: list[dict] | None = None,
|
||
existing_timelines: list[dict] | None = None,
|
||
) -> tuple[Any, list[dict]]:
|
||
"""Return (async_executor, captured_calls).
|
||
|
||
The executor handles all ``execute_on_client`` payloads:
|
||
directory listing, file reading, project/entity fetching, and CRUD.
|
||
"""
|
||
calls: list[dict] = []
|
||
_projects = projects if projects is not None else list(_PROJECTS.values())
|
||
|
||
async def _executor(payload: dict) -> dict:
|
||
action = payload.get("action", "")
|
||
table = payload.get("table", "")
|
||
data = payload.get("data") or {}
|
||
calls.append({"action": action, "table": table, "data": data})
|
||
|
||
if action == "list_directory":
|
||
return {"entries": [{"type": "file", "path": file_path}]}
|
||
|
||
if action == "get_file_metadata":
|
||
return {"modifiedAt": None}
|
||
|
||
if action == "read_file_content":
|
||
return {"content": file_content}
|
||
|
||
if action == "select":
|
||
if table == "projects":
|
||
return {"rows": _projects}
|
||
if table == "tasks":
|
||
return {"rows": existing_tasks or []}
|
||
if table == "notes":
|
||
return {"rows": existing_notes or []}
|
||
if table == "timelines":
|
||
return {"rows": existing_timelines or []}
|
||
return {"rows": []}
|
||
|
||
if action == "insert":
|
||
return {"row": {"id": str(uuid.uuid4()), **data}}
|
||
|
||
if action == "update":
|
||
return {"success": True}
|
||
|
||
return {}
|
||
|
||
return _executor, calls
|
||
|
||
|
||
# ── Unit: helper functions ────────────────────────────────────────────────
|
||
|
||
|
||
def test_format_projects_empty():
    """Formatting an empty project list yields text containing "(no projects"."""
    rendered = _format_projects([])
    assert "(no projects" in rendered
|
||
|
||
|
||
def test_format_projects_with_data():
    """The formatted output mentions both the id and the name of the project."""
    rendered = _format_projects([_PROJECTS["alpha"]])
    for expected in ("proj-alpha", "Project Alpha"):
        assert expected in rendered
|
||
|
||
|
||
def test_format_metadata_empty():
    """Empty metadata formats to the empty string."""
    rendered = _format_metadata({})
    assert rendered == ""
|
||
|
||
|
||
def test_format_metadata_email():
    """Subject and sender from the metadata appear in the formatted text."""
    rendered = _format_metadata(
        {"subject": "Fix bug", "from": "boss@co.com", "date": "2026-04-07"}
    )
    for expected in ("Fix bug", "boss@co.com"):
        assert expected in rendered
|
||
|
||
|
||
def test_get_extraction_rules_match():
    """Rules returned for the configured "email_html" type mention "task"."""
    lowered = _get_extraction_rules(_AGENT_CONFIG, "email_html").lower()
    assert "task" in lowered
|
||
|
||
|
||
def test_get_extraction_rules_fallback():
    """Rules returned for "plain_text", a type absent from the config, mention "extract"."""
    lowered = _get_extraction_rules(_AGENT_CONFIG, "plain_text").lower()
    assert "extract" in lowered
|
||
|
||
|
||
def test_get_no_match_behavior_from_global_rules():
    """A config that carries global rules yields a non-empty no-match behavior."""
    resolved = _get_no_match_behavior(_AGENT_CONFIG)
    assert resolved  # non-empty
|
||
|
||
|
||
def test_get_no_match_behavior_default():
    """With an empty config the no-match behavior text mentions "project"."""
    lowered = _get_no_match_behavior({}).lower()
    assert "project" in lowered
|
||
|
||
|
||
# ── Unit: 2.9 — device offline ───────────────────────────────────────────
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_2_9_device_offline():
    """2.9 No device online → run finalized with status=error and a "not connected" error."""
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager(online=False)

    with patch("app.core.scout_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
        await run_local_agent(_USER_ID, config, run_log, mgr)

    # call_args is None when the mock was never called; assert first so the
    # failure reads "expected to have been called" rather than an unpacking
    # TypeError.
    mock_fin.assert_called()
    _, kwargs = mock_fin.call_args
    assert kwargs["status"] == "error"
    # Tolerate errors=None as well as a missing key.
    errors = kwargs.get("errors") or []
    assert any("not connected" in e for e in errors)
|
||
|
||
|
||
# ── Unit: 2.10 — empty file ──────────────────────────────────────────────
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_2_10_empty_file():
    """2.10 File with empty content → skipped, items_processed=0, success."""
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()

    # The captured calls are not inspected by this test.
    executor, _calls = _make_executor(
        file_path="/emails/empty.html",
        file_content="",
        projects=[_PROJECTS["alpha"]],
    )

    with patch("app.core.scout_runner._make_agent_executor", return_value=executor), \
         patch("app.core.scout_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
        await run_local_agent(_USER_ID, config, run_log, mgr)

    # call_args is None when the mock was never called; assert first so the
    # failure is readable instead of an unpacking TypeError.
    mock_fin.assert_called()
    _, kwargs = mock_fin.call_args
    assert kwargs["items_processed"] == 0
    assert kwargs["status"] == "success"
    assert kwargs["items_created"] == 0
|
||
|
||
|
||
# ── Unit: 2.8 — items_created count ─────────────────────────────────────
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_2_8_items_created_count():
    """2.8 items_created == number of create_* tool calls per run."""
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()

    executor, _calls = _make_executor(
        file_path="/emails/action.html",
        file_content="<html><body><p>Fix the login bug in Project Alpha.</p></body></html>",
        projects=[_PROJECTS["alpha"]],
    )

    async def mock_run_agent(*, _tool_calls_out=None, **kw) -> str:
        # Stand-in for the LLM call: report two create_* tools and one update.
        if _tool_calls_out is not None:
            _tool_calls_out.extend(["create_task", "create_note", "update_task"])
        return "Done."

    with patch("app.core.scout_runner._make_agent_executor", return_value=executor), \
         patch("app.core.scout_runner._run_agent_with_tools", side_effect=mock_run_agent), \
         patch("app.core.scout_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
        await run_local_agent(_USER_ID, config, run_log, mgr)

    # call_args is None when the mock was never called; assert first so the
    # failure is readable instead of an unpacking TypeError.
    mock_fin.assert_called()
    _, kwargs = mock_fin.call_args
    # Only create_task + create_note count (not update_task).
    assert kwargs["items_created"] == 2
    assert kwargs["items_processed"] == 1
|
||
|
||
|
||
# ── Eval: 2.1–2.7 — fixture-driven, real LLM + Langfuse scoring ──────────
#
# Cases loaded from tests/fixtures/agent_runner_v2/cases.yaml.
# Supported assertions (from YAML):
#   expect_insert: <table>     → at least 1 insert in that table
#   expect_no_insert: true     → zero inserts in any table
#   expect_project_id: <id>    → any insert carries this projectId
#   expect_dedup: true         → task inserts == 0 OR task updates >= 1
# ─────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
@pytest.mark.asyncio
@pytest.mark.eval
async def test_eval_runner(runner_case, pytestconfig):
    """Parametrized eval test — one invocation per YAML case.

    Runs ``run_local_agent`` against a fake executor built from the case,
    scores the captured executor calls with ``_evaluate_case``, records the
    score on a Langfuse observation when a client is available, and finally
    asserts that the score is 1.0.
    """
    case: dict = runner_case
    data_dir = _fixtures_dir(pytestconfig) / "data"
    file_content = _read_case_file(case, data_dir)
    projects = _resolve_projects(case.get("projects", []))

    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()

    executor, calls = _make_executor(
        file_path=case["file_path"],
        file_content=file_content,
        projects=projects,
        existing_tasks=case.get("existing_tasks"),
        existing_notes=case.get("existing_notes"),
        existing_timelines=case.get("existing_timelines"),
    )

    lf = get_langfuse()
    # Without a Langfuse client the run happens inside a no-op context whose
    # ``as`` target is None.
    obs_ctx = lf.start_as_current_observation(
        name=f"eval-runner-{case['id']}-{case.get('score_name', 'unknown').replace('.', '-')}",
        metadata={"step": "2", "case_id": case["id"]},
    ) if lf else nullcontext()

    try:
        with obs_ctx as obs:
            with patch("app.core.scout_runner._make_agent_executor", return_value=executor), \
                 patch("app.core.scout_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
                await run_local_agent(_USER_ID, config, run_log, mgr)

            # call_args is None when the mock was never called; assert first
            # so the failure is readable instead of an unpacking TypeError.
            mock_fin.assert_called()
            _, kwargs = mock_fin.call_args
            score, comment = _evaluate_case(case, calls, kwargs)

            if obs is not None:
                obs.score(
                    name=case.get("score_name", f"runner.case_{case['id']}"),
                    value=score,
                    comment=comment,
                )
    finally:
        # Flush even when the run or a check above raised, so the observation
        # for a crashed case is still sent.
        if lf:
            lf.flush()

    assert score == 1.0, f"[{case['id']}] {case.get('description', '')} — {comment}"
|
||
|
||
|
||
def _evaluate_case(case: dict, calls: list[dict], finalize_kwargs: dict) -> tuple[float, str]:
|
||
"""Return (score, comment) for a YAML case given the captured executor calls."""
|
||
inserts = [c for c in calls if c["action"] == "insert"]
|
||
|
||
if case.get("expect_no_insert"):
|
||
score = 1.0 if len(inserts) == 0 else 0.0
|
||
return score, f"inserts={len(inserts)} (expected 0)"
|
||
|
||
if "expect_insert" in case:
|
||
tables = case["expect_insert"]
|
||
if isinstance(tables, str):
|
||
tables = [tables]
|
||
missing = [t for t in tables if not any(c["table"] == t for c in inserts)]
|
||
score = 1.0 if not missing else 0.0
|
||
counts = {t: sum(1 for c in inserts if c["table"] == t) for t in tables}
|
||
return score, f"inserts={counts}" + (f" missing={missing}" if missing else "")
|
||
|
||
if "expect_project_id" in case:
|
||
expected_pid = case["expect_project_id"]
|
||
correct = any(c.get("data", {}).get("projectId") == expected_pid for c in inserts)
|
||
score = 1.0 if correct else 0.0
|
||
all_pids = [c.get("data", {}).get("projectId") for c in inserts]
|
||
return score, f"projectIds={all_pids} (expected {expected_pid!r})"
|
||
|
||
if case.get("expect_dedup"):
|
||
task_creates = [c for c in inserts if c["table"] == "tasks"]
|
||
task_updates = [c for c in calls if c["action"] == "update" and c["table"] == "tasks"]
|
||
score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0
|
||
return score, f"task_creates={len(task_creates)} task_updates={len(task_updates)}"
|
||
|
||
return 0.0, "no assertion defined in case"
|