Files
api/tests/test_agent_runner_v2.py

431 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for Local Agent V2 runner (Step 2).
Covers the unified per-file flow:
Phase A — detect + preprocess (Python, zero LLM)
Phase B — single LLM call with tools (classify + extract + create)
Fixture-based eval tests (2.12.7)
-----------------------------------
Cases are defined in tests/fixtures/agent_runner_v2/cases.yaml.
Email HTML files live in tests/fixtures/agent_runner_v2/data/.
Use --runner-dir to point at a custom folder (same structure required).
Unit tests (no LLM)
--------------------
2.8 items_created count → items_created == N create_* calls
2.9 Device offline → status=error
2.10 Empty file → items_processed=0, status=success
Run:
pytest tests/test_agent_runner_v2.py -v
pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8" # unit only
pytest tests/test_agent_runner_v2.py -v -k "eval" # LLM evals only
pytest tests/test_agent_runner_v2.py -v --runner-dir /path/to/dir # custom fixtures
"""
from __future__ import annotations
import uuid
from contextlib import nullcontext
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
import yaml
from app.core.scout_runner import (
_format_metadata,
_format_projects,
_get_extraction_rules,
_get_no_match_behavior,
run_local_agent,
)
from app.core.device_manager import DeviceConnectionManager
from app.core.langfuse_client import get_langfuse
from app.models import ScoutRunLog, LocalScoutConfig
from tests.conftest import TEST_USER_IDS
# ── Constants ─────────────────────────────────────────────────────────────
_USER_ID = TEST_USER_IDS["power"]
_DEFAULT_FIXTURE_DIR = Path(__file__).parent / "fixtures" / "agent_runner_v2"
_AGENT_CONFIG = {
"content_types": [
{
"id": "email_html",
"label": "Email HTML",
"detection_hint": "HTML file with From/To/Subject headers",
"preprocessing": "email_html",
"extraction_prompt": (
"If the email contains a direct action request or task assignment → create a task. "
"If the email contains informational content, updates, or FYI → create a note. "
"If the email mentions a specific date for a meeting or deadline → create a timeline entry."
),
}
],
"global_rules": [
"Se il file non è riconducibile a nessun progetto, non creare alcuna entità."
],
"data_types": ["tasks", "notes", "timelines"],
}
# Canonical project definitions, referenced symbolically in cases.yaml.
_PROJECTS: dict[str, dict] = {
"alpha": {"id": "proj-alpha", "name": "Project Alpha", "status": "active"},
"beta": {"id": "proj-beta", "name": "Project Beta", "status": "active"},
}
# ── Fixture loading ───────────────────────────────────────────────────────
def _fixtures_dir(config) -> Path:
override = config.getoption("--runner-dir")
return Path(override) if override else _DEFAULT_FIXTURE_DIR
def _load_cases(config) -> list[dict]:
return yaml.safe_load(
(_fixtures_dir(config) / "cases.yaml").read_text(encoding="utf-8")
)
def _read_case_file(case: dict, data_dir: Path) -> str:
return (data_dir / case["file"]).read_text(encoding="utf-8")
def _resolve_projects(entries: list[str | dict]) -> list[dict]:
"""Resolve project list from YAML: symbolic names and/or inline dicts."""
result = []
for entry in entries:
if isinstance(entry, str):
if entry in _PROJECTS:
result.append(_PROJECTS[entry])
elif isinstance(entry, dict):
result.append(entry)
return result
# ── pytest_generate_tests — parametrize eval tests from YAML ─────────────
def pytest_generate_tests(metafunc):
if "runner_case" not in metafunc.fixturenames:
return
cases = _load_cases(metafunc.config)
metafunc.parametrize("runner_case", cases, ids=[c["id"] for c in cases])
# ── Test helpers ──────────────────────────────────────────────────────────
def _make_config(
agent_config: dict | None = None,
directory: str = "/emails",
device_id: str = "dev-001",
) -> LocalScoutConfig:
return LocalScoutConfig(
id=str(uuid.uuid4()),
user_id=_USER_ID,
device_id=device_id,
name="Test V2 Agent",
directory_paths=[directory],
data_types=["tasks", "notes", "timelines"],
prompt_template="",
scout_config=agent_config or _AGENT_CONFIG,
file_extensions=[".html", ".eml"],
schedule_cron="0 */6 * * *",
enabled=True,
last_run_at=None,
)
def _make_run_log(agent_id: str) -> ScoutRunLog:
return ScoutRunLog(
id=str(uuid.uuid4()),
scout_id=agent_id,
scout_type="local",
user_id=_USER_ID,
status="running",
started_at=datetime.now(timezone.utc),
)
def _make_manager(online: bool = True) -> DeviceConnectionManager:
mgr = DeviceConnectionManager()
if online:
ws = MagicMock()
ws.send_text = AsyncMock()
mgr.register(_USER_ID, "dev-001", ws)
return mgr
def _make_executor(
file_path: str,
file_content: str,
projects: list[dict] | None = None,
existing_tasks: list[dict] | None = None,
existing_notes: list[dict] | None = None,
existing_timelines: list[dict] | None = None,
) -> tuple[Any, list[dict]]:
"""Return (async_executor, captured_calls).
The executor handles all ``execute_on_client`` payloads:
directory listing, file reading, project/entity fetching, and CRUD.
"""
calls: list[dict] = []
_projects = projects if projects is not None else list(_PROJECTS.values())
async def _executor(payload: dict) -> dict:
action = payload.get("action", "")
table = payload.get("table", "")
data = payload.get("data") or {}
calls.append({"action": action, "table": table, "data": data})
if action == "list_directory":
return {"entries": [{"type": "file", "path": file_path}]}
if action == "get_file_metadata":
return {"modifiedAt": None}
if action == "read_file_content":
return {"content": file_content}
if action == "select":
if table == "projects":
return {"rows": _projects}
if table == "tasks":
return {"rows": existing_tasks or []}
if table == "notes":
return {"rows": existing_notes or []}
if table == "timelines":
return {"rows": existing_timelines or []}
return {"rows": []}
if action == "insert":
return {"row": {"id": str(uuid.uuid4()), **data}}
if action == "update":
return {"success": True}
return {}
return _executor, calls
# ── Unit: helper functions ────────────────────────────────────────────────
def test_format_projects_empty():
assert "(no projects" in _format_projects([])
def test_format_projects_with_data():
result = _format_projects([_PROJECTS["alpha"]])
assert "proj-alpha" in result
assert "Project Alpha" in result
def test_format_metadata_empty():
assert _format_metadata({}) == ""
def test_format_metadata_email():
meta = {"subject": "Fix bug", "from": "boss@co.com", "date": "2026-04-07"}
result = _format_metadata(meta)
assert "Fix bug" in result
assert "boss@co.com" in result
def test_get_extraction_rules_match():
rules = _get_extraction_rules(_AGENT_CONFIG, "email_html")
assert "task" in rules.lower()
def test_get_extraction_rules_fallback():
rules = _get_extraction_rules(_AGENT_CONFIG, "plain_text")
assert "extract" in rules.lower()
def test_get_no_match_behavior_from_global_rules():
behavior = _get_no_match_behavior(_AGENT_CONFIG)
assert behavior # non-empty
def test_get_no_match_behavior_default():
behavior = _get_no_match_behavior({})
assert "project" in behavior.lower()
# ── Unit: 2.9 — device offline ───────────────────────────────────────────
@pytest.mark.asyncio
async def test_2_9_device_offline():
"""2.9 No device online → status=error, no executor created."""
config = _make_config()
run_log = _make_run_log(config.id)
mgr = _make_manager(online=False)
with patch("app.core.scout_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
await run_local_agent(_USER_ID, config, run_log, mgr)
_, kwargs = mock_fin.call_args
assert kwargs["status"] == "error"
assert any("not connected" in e for e in kwargs.get("errors", []))
# ── Unit: 2.10 — empty file ──────────────────────────────────────────────
@pytest.mark.asyncio
async def test_2_10_empty_file():
"""2.10 File with empty content → skipped, items_processed=0, success."""
config = _make_config()
run_log = _make_run_log(config.id)
mgr = _make_manager()
executor, calls = _make_executor(
file_path="/emails/empty.html",
file_content="",
projects=[_PROJECTS["alpha"]],
)
with patch("app.core.scout_runner._make_agent_executor", return_value=executor), \
patch("app.core.scout_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
await run_local_agent(_USER_ID, config, run_log, mgr)
_, kwargs = mock_fin.call_args
assert kwargs["items_processed"] == 0
assert kwargs["status"] == "success"
assert kwargs["items_created"] == 0
# ── Unit: 2.8 — items_created count ─────────────────────────────────────
@pytest.mark.asyncio
async def test_2_8_items_created_count():
"""2.8 items_created == number of create_* tool calls per run."""
config = _make_config()
run_log = _make_run_log(config.id)
mgr = _make_manager()
executor, _calls = _make_executor(
file_path="/emails/action.html",
file_content="<html><body><p>Fix the login bug in Project Alpha.</p></body></html>",
projects=[_PROJECTS["alpha"]],
)
async def mock_run_agent(*, _tool_calls_out=None, **kw) -> str:
if _tool_calls_out is not None:
_tool_calls_out.extend(["create_task", "create_note", "update_task"])
return "Done."
with patch("app.core.scout_runner._make_agent_executor", return_value=executor), \
patch("app.core.scout_runner._run_agent_with_tools", side_effect=mock_run_agent), \
patch("app.core.scout_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
await run_local_agent(_USER_ID, config, run_log, mgr)
_, kwargs = mock_fin.call_args
# Only create_task + create_note count (not update_task).
assert kwargs["items_created"] == 2
assert kwargs["items_processed"] == 1
# ── Eval: 2.12.7 — fixture-driven, real LLM + Langfuse scoring ──────────
#
# Cases loaded from tests/fixtures/agent_runner_v2/cases.yaml.
# Supported assertions (from YAML):
# expect_insert: <table> → at least 1 insert in that table
# expect_no_insert: true → zero inserts in any table
# expect_project_id: <id> → any insert carries this projectId
# expect_dedup: true → task inserts == 0 OR task updates >= 1
# ─────────────────────────────────────────────────────────────────────────
@pytest.mark.asyncio
@pytest.mark.eval
async def test_eval_runner(runner_case, pytestconfig):
"""Parametrized eval test — one invocation per YAML case."""
case: dict = runner_case
data_dir = _fixtures_dir(pytestconfig) / "data"
file_content = _read_case_file(case, data_dir)
projects = _resolve_projects(case.get("projects", []))
config = _make_config()
run_log = _make_run_log(config.id)
mgr = _make_manager()
executor, calls = _make_executor(
file_path=case["file_path"],
file_content=file_content,
projects=projects,
existing_tasks=case.get("existing_tasks"),
existing_notes=case.get("existing_notes"),
existing_timelines=case.get("existing_timelines"),
)
lf = get_langfuse()
obs_ctx = lf.start_as_current_observation(
name=f"eval-runner-{case['id']}-{case.get('score_name', 'unknown').replace('.', '-')}",
metadata={"step": "2", "case_id": case["id"]},
) if lf else nullcontext()
with obs_ctx as obs:
with patch("app.core.scout_runner._make_agent_executor", return_value=executor), \
patch("app.core.scout_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
await run_local_agent(_USER_ID, config, run_log, mgr)
_, kwargs = mock_fin.call_args
score, comment = _evaluate_case(case, calls, kwargs)
if obs is not None:
obs.score(
name=case.get("score_name", f"runner.case_{case['id']}"),
value=score,
comment=comment,
)
if lf:
lf.flush()
assert score == 1.0, f"[{case['id']}] {case.get('description', '')}{comment}"
def _evaluate_case(case: dict, calls: list[dict], finalize_kwargs: dict) -> tuple[float, str]:
"""Return (score, comment) for a YAML case given the captured executor calls."""
inserts = [c for c in calls if c["action"] == "insert"]
if case.get("expect_no_insert"):
score = 1.0 if len(inserts) == 0 else 0.0
return score, f"inserts={len(inserts)} (expected 0)"
if "expect_insert" in case:
tables = case["expect_insert"]
if isinstance(tables, str):
tables = [tables]
missing = [t for t in tables if not any(c["table"] == t for c in inserts)]
score = 1.0 if not missing else 0.0
counts = {t: sum(1 for c in inserts if c["table"] == t) for t in tables}
return score, f"inserts={counts}" + (f" missing={missing}" if missing else "")
if "expect_project_id" in case:
expected_pid = case["expect_project_id"]
correct = any(c.get("data", {}).get("projectId") == expected_pid for c in inserts)
score = 1.0 if correct else 0.0
all_pids = [c.get("data", {}).get("projectId") for c in inserts]
return score, f"projectIds={all_pids} (expected {expected_pid!r})"
if case.get("expect_dedup"):
task_creates = [c for c in inserts if c["table"] == "tasks"]
task_updates = [c for c in calls if c["action"] == "update" and c["table"] == "tasks"]
score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0
return score, f"task_creates={len(task_creates)} task_updates={len(task_updates)}"
return 0.0, "no assertion defined in case"