feat(local-agent-v2): step 2+3 — unified runner + AgentConfig schema

Step 3 (prerequisite):
- app/schemas.py: add ContentTypeConfig + AgentConfig Pydantic models
- app/models.py: add agent_config (JSON, nullable) to LocalAgentConfig
- alembic migration a3b9c0d1e2f3: ADD COLUMN agent_config

Step 2 (runner refactor):
- Remove _classify_file() and _BATCH_FILE_CLASSIFIER_PROMPT (LLM classification step)
- Add Phase A: detect_content_type + preprocess (zero LLM, per file)
- Add _UNIFIED_PROCESSING_PROMPT (hot-swappable via Langfuse "unified_processing")
- Add helper functions: _format_projects, _format_metadata, _get_extraction_rules,
  _get_no_match_behavior
- Single LLM call per file with tools (classify + extract + create)
- Fix items_created: count create_* tool calls via _tool_calls_out param
- test_agent_runner_v2.py: 10 cases (2.1-2.10) with Langfuse eval scoring

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Roberto Musso
2026-04-07 15:00:32 +02:00
parent d91c98f86d
commit fa231a3642
5 changed files with 796 additions and 260 deletions

View File

@@ -0,0 +1,587 @@
"""Tests for Local Agent V2 runner (Step 2).
Covers the unified per-file flow:
Phase A — detect + preprocess (Python, zero LLM)
Phase B — single LLM call with tools (classify + extract + create)
Test cases:
2.1 Happy path: email with action → create_task called
2.2 Happy path: email informative → create_note called
2.3 Happy path: email with date → create_timeline called
2.4 Project matching via filename → correct project_id used
2.5 Project matching via content → correct project_id used
2.6 No project match + global rule → no create_* called
2.7 Deduplication → update_task, not create_task
2.8 items_created count (unit) → items_created == N create_* calls
2.9 Device offline (unit) → status=error
2.10 Empty file (unit) → items_processed=0, status=success
Run:
pytest tests/test_agent_runner_v2.py -v
pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8" # unit only
pytest tests/test_agent_runner_v2.py -v -k "eval" # LLM evals only
"""
from __future__ import annotations
import uuid
from datetime import datetime, timezone
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from app.core.agent_runner import (
_format_metadata,
_format_projects,
_get_extraction_rules,
_get_no_match_behavior,
_is_overdue,
run_local_agent,
)
from app.core.device_manager import DeviceConnectionManager
from app.core.langfuse_client import get_langfuse, get_prompt_or_fallback
from app.models import AgentRunLog, LocalAgentConfig
from tests.conftest import TEST_USER_IDS
# ── Constants ─────────────────────────────────────────────────────────────
# User on whose behalf every run in this module executes: the "power" entry
# of the TEST_USER_IDS mapping imported from tests.conftest.
_USER_ID = TEST_USER_IDS["power"]

# Default agent configuration handed to the configs built by _make_config.
# It declares one content type (HTML email) with its extraction prompt, one
# global rule and the three entity kinds the agent may produce.
_AGENT_CONFIG = {
    "content_types": [
        {
            "id": "email_html",
            "label": "Email HTML",
            "detection_hint": "HTML file with From/To/Subject headers",
            "preprocessing": "email_html",
            "extraction_prompt": (
                "If the email contains a direct action request or task assignment → create a task. "
                "If the email contains informational content, updates, or FYI → create a note. "
                "If the email mentions a specific date for a meeting or deadline → create a timeline entry."
            ),
        }
    ],
    "global_rules": [
        # Italian: "If the file cannot be traced back to any project, do not
        # create any entity." Exercised by test 2.6.
        "Se il file non è riconducibile a nessun progetto, non creare alcuna entità."
    ],
    "data_types": ["tasks", "notes", "timelines"],
}

# Project rows returned by the fake executor for `select` on "projects".
_PROJECT_ALPHA = {"id": "proj-alpha", "name": "Project Alpha", "status": "active"}
_PROJECT_BETA = {"id": "proj-beta", "name": "Project Beta", "status": "active"}
# ── Sample email content ──────────────────────────────────────────────────
# Email asking for a concrete action; the body names "Project Alpha".
# Used by the task-creation, project-matching and deduplication tests.
_ACTION_EMAIL = """\
<html><head></head><body>
<p><b>From:</b> boss@company.com</p>
<p><b>To:</b> dev@company.com</p>
<p><b>Subject:</b> Fix the login bug</p>
<p><b>Date:</b> 2026-04-07</p>
<p>Hi,<br>Please fix the login bug in Project Alpha by Friday. High priority!</p>
</body></html>
"""

# Purely informational email (explicitly "no action needed"); used by 2.2.
_INFO_EMAIL = """\
<html><head></head><body>
<p><b>From:</b> pm@company.com</p>
<p><b>To:</b> team@company.com</p>
<p><b>Subject:</b> FYI: New policy for Project Alpha</p>
<p>Just a heads-up that starting next week all code reviews must be done
within 24 hours for Project Alpha. No action needed from you now.</p>
</body></html>
"""

# Email announcing a meeting on a specific date; used by 2.3.
_DATE_EMAIL = """\
<html><head></head><body>
<p><b>From:</b> pm@company.com</p>
<p><b>Subject:</b> Project Alpha kick-off meeting</p>
<p>The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.</p>
</body></html>
"""

# Newsletter that mentions neither project; used by 2.6.
_NO_PROJECT_EMAIL = """\
<html><head></head><body>
<p><b>From:</b> newsletter@ads.com</p>
<p><b>Subject:</b> Weekly newsletter</p>
<p>Check out our latest deals on electronics!</p>
</body></html>
"""

# Pre-existing task row whose title equals the subject of _ACTION_EMAIL;
# returned for `select` on "tasks" in the deduplication test (2.7).
_EXISTING_TASK = {
    "id": "task-existing",
    "title": "Fix the login bug",
    "status": "todo",
    "priority": "medium",
}
# ── Test helpers ──────────────────────────────────────────────────────────
def _make_config(
    agent_config: dict | None = None,
    directory: str = "/emails",
    device_id: str = "dev-001",
) -> LocalAgentConfig:
    """Construct a ``LocalAgentConfig`` for the V2 runner tests.

    Args:
        agent_config: Agent configuration dict. ``None`` selects the shared
            ``_AGENT_CONFIG`` fixture; an explicitly passed empty dict is
            kept as-is so "no configuration" scenarios can be exercised.
        directory: The single directory path the agent is configured with.
        device_id: Identifier of the device the agent is bound to.

    Returns:
        An enabled config with a freshly generated id and no previous run.
    """
    # Only fall back on None: `agent_config or _AGENT_CONFIG` would also
    # discard a deliberate `{}` override.
    effective_config = _AGENT_CONFIG if agent_config is None else agent_config
    return LocalAgentConfig(
        id=str(uuid.uuid4()),
        user_id=_USER_ID,
        device_id=device_id,
        name="Test V2 Agent",
        directory_paths=[directory],
        data_types=["tasks", "notes", "timelines"],
        prompt_template="",
        agent_config=effective_config,
        file_extensions=[".html", ".eml"],
        schedule_cron="0 */6 * * *",
        enabled=True,
        last_run_at=None,
    )
def _make_run_log(agent_id: str) -> AgentRunLog:
    """Create a run-log row in the ``running`` state for a local agent.

    Args:
        agent_id: Id of the agent configuration the log belongs to.

    Returns:
        An ``AgentRunLog`` with a fresh id, started at the current UTC time.
    """
    log_fields = {
        "id": str(uuid.uuid4()),
        "agent_id": agent_id,
        "agent_type": "local",
        "user_id": _USER_ID,
        "status": "running",
        "started_at": datetime.now(timezone.utc),
    }
    return AgentRunLog(**log_fields)
def _make_manager(online: bool = True, device_id: str = "dev-001") -> DeviceConnectionManager:
    """Create a ``DeviceConnectionManager``, optionally with one device registered.

    Args:
        online: When true, a mocked websocket is registered for ``_USER_ID``
            so the device appears connected; when false the manager is
            returned with nothing registered.
        device_id: Device id to register. Defaults to the same id that
            ``_make_config`` uses, and can be overridden to match a config
            built with a custom ``device_id``.

    Returns:
        The configured manager.
    """
    mgr = DeviceConnectionManager()
    if online:
        ws = MagicMock()
        # send_text is awaited by async callers, so it must be an AsyncMock.
        ws.send_text = AsyncMock()
        mgr.register(_USER_ID, device_id, ws)
    return mgr
def _make_executor(
file_path: str,
file_content: str,
projects: list[dict] | None = None,
existing_tasks: list[dict] | None = None,
existing_notes: list[dict] | None = None,
existing_timelines: list[dict] | None = None,
) -> tuple[Any, list[dict]]:
"""Return (async_executor, captured_calls).
The executor handles all ``execute_on_client`` payloads:
directory listing, file reading, project/entity fetching, and CRUD.
"""
calls: list[dict] = []
_projects = projects or [_PROJECT_ALPHA, _PROJECT_BETA]
async def _executor(payload: dict) -> dict:
action = payload.get("action", "")
table = payload.get("table", "")
data = payload.get("data") or {}
calls.append({"action": action, "table": table, "data": data})
if action == "list_directory":
path = data.get("path", "") or payload.get("data", {}).get("path", "")
return {
"entries": [{"type": "file", "path": file_path}]
}
if action == "get_file_metadata":
return {"modifiedAt": None}
if action == "read_file_content":
return {"content": file_content}
if action == "select":
if table == "projects":
return {"rows": _projects}
if table == "tasks":
return {"rows": existing_tasks or []}
if table == "notes":
return {"rows": existing_notes or []}
if table == "timelines":
return {"rows": existing_timelines or []}
return {"rows": []}
if action == "insert":
return {"row": {"id": str(uuid.uuid4()), **data}}
if action == "update":
return {"success": True}
return {}
return _executor, calls
# ── Unit: helper functions ────────────────────────────────────────────────
def test_format_projects_empty():
    """An empty project list is rendered with the "(no projects" placeholder."""
    rendered = _format_projects([])
    assert "(no projects" in rendered
def test_format_projects_with_data():
    """Both the id and the name of a project appear in the rendered text."""
    rendered = _format_projects([_PROJECT_ALPHA])
    for expected in ("proj-alpha", "Project Alpha"):
        assert expected in rendered
def test_format_metadata_empty():
    """Empty metadata is formatted as an empty string."""
    rendered = _format_metadata({})
    assert rendered == ""
def test_format_metadata_email():
    """Subject and sender of email metadata appear in the formatted text."""
    rendered = _format_metadata(
        {"subject": "Fix bug", "from": "boss@co.com", "date": "2026-04-07"}
    )
    for expected in ("Fix bug", "boss@co.com"):
        assert expected in rendered
def test_get_extraction_rules_match():
    """A configured content type yields rules that mention tasks."""
    lowered_rules = _get_extraction_rules(_AGENT_CONFIG, "email_html").lower()
    assert "task" in lowered_rules
def test_get_extraction_rules_fallback():
    """An unconfigured content type yields rules containing "extract"."""
    lowered_rules = _get_extraction_rules(_AGENT_CONFIG, "plain_text").lower()
    assert "extract" in lowered_rules
def test_get_no_match_behavior_from_global_rules():
    """A config carrying a global rule produces a non-empty no-match behavior."""
    # The global rule reads "non creare alcuna entità" (create no entity),
    # so some skip behavior is expected; only non-emptiness is checked.
    no_match = _get_no_match_behavior(_AGENT_CONFIG)
    assert bool(no_match)
def test_get_no_match_behavior_default():
    """Without any configuration the default behavior text mentions "project"."""
    lowered_default = _get_no_match_behavior({}).lower()
    assert "project" in lowered_default
# ── Unit: 2.9 — device offline ───────────────────────────────────────────
@pytest.mark.asyncio
async def test_2_9_device_offline():
    """2.9 No device online → status=error, no executor created."""
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager(online=False)

    # The executor factory is patched as well so the "no executor created"
    # half of the contract is actually verified.
    with patch("app.core.agent_runner._make_agent_executor") as mock_make_exec, \
            patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
        await run_local_agent(_USER_ID, config, run_log, mgr)

    # Fail with a clear message, rather than on tuple-unpacking None, if the
    # runner never reached finalization.
    mock_fin.assert_called()
    _, kwargs = mock_fin.call_args
    assert kwargs["status"] == "error"
    # `or []` also covers an explicit errors=None.
    assert any("not connected" in e for e in (kwargs.get("errors") or []))
    mock_make_exec.assert_not_called()
# ── Unit: 2.10 — empty file ──────────────────────────────────────────────
@pytest.mark.asyncio
async def test_2_10_empty_file():
    """2.10 Empty file content → nothing processed or created, run succeeds."""
    agent_cfg = _make_config()
    log_entry = _make_run_log(agent_cfg.id)
    manager = _make_manager()
    fake_executor, _recorded = _make_executor(
        file_path="/emails/empty.html",
        file_content="",  # the file exists but has no content
        projects=[_PROJECT_ALPHA],
    )

    executor_patch = patch(
        "app.core.agent_runner._make_agent_executor", return_value=fake_executor
    )
    finalize_patch = patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock)
    with executor_patch, finalize_patch as finalize_mock:
        await run_local_agent(_USER_ID, agent_cfg, log_entry, manager)

    # Keyword arguments of the last finalize call carry the run summary.
    summary = finalize_mock.call_args[1]
    assert summary["items_processed"] == 0
    assert summary["status"] == "success"
    assert summary["items_created"] == 0
# ── Unit: 2.8 — items_created count ─────────────────────────────────────
@pytest.mark.asyncio
async def test_2_8_items_created_count():
    """2.8 items_created == number of create_* tool calls per run."""
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, _calls = _make_executor(
        file_path="/emails/action.html",
        file_content=_ACTION_EMAIL,
        projects=[_PROJECT_ALPHA],
    )

    # Simulate the LLM calling create_task, create_note and update_task once
    # each; the runner collects tool names through `_tool_calls_out`.
    async def mock_run_agent(*, _tool_calls_out=None, **kw) -> str:
        if _tool_calls_out is not None:
            _tool_calls_out.extend(["create_task", "create_note", "update_task"])
        return "Done."

    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
            patch("app.core.agent_runner._run_agent_with_tools", side_effect=mock_run_agent), \
            patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
        await run_local_agent(_USER_ID, config, run_log, mgr)

    # Fail with a clear message if the runner never reached finalization.
    mock_fin.assert_called()
    _, kwargs = mock_fin.call_args
    # Only create_task + create_note count (not update_task).
    assert kwargs["items_created"] == 2
    assert kwargs["items_processed"] == 1
# ── Eval: 2.1–2.7 (real LLM + Langfuse scoring) ─────────────────────────
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_1_email_to_task():
    """2.1 Action email → at least one task insert. Score: runner.email_to_task."""
    lf = get_langfuse()
    if lf:
        trace = lf.trace(
            name="eval-runner-2.1-email-to-task",
            metadata={"step": "2"},
        )
    else:
        trace = None

    agent_cfg = _make_config()
    log_entry = _make_run_log(agent_cfg.id)
    manager = _make_manager()
    fake_executor, recorded = _make_executor(
        file_path="/emails/ProjectAlpha_action.html",
        file_content=_ACTION_EMAIL,
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )

    executor_patch = patch(
        "app.core.agent_runner._make_agent_executor", return_value=fake_executor
    )
    finalize_patch = patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock)
    with executor_patch, finalize_patch as mock_fin:
        await run_local_agent(_USER_ID, agent_cfg, log_entry, manager)

    _, kwargs = mock_fin.call_args
    # Every insert into the tasks table counts as a created task.
    task_creates = [
        call
        for call in recorded
        if call["action"] == "insert" and call["table"] == "tasks"
    ]
    score = 1.0 if task_creates else 0.0

    if lf and trace:
        lf.score(
            trace_id=trace.id,
            name="runner.email_to_task",
            value=score,
            comment=f"task_creates={len(task_creates)} items_created={kwargs.get('items_created')}",
        )
        lf.flush()

    assert score == 1.0, f"Expected at least 1 task created, got {len(task_creates)}"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_2_email_to_note():
    """2.2 Informational email → at least one note insert. Score: runner.email_to_note."""
    lf = get_langfuse()
    if lf:
        trace = lf.trace(name="eval-runner-2.2-email-to-note", metadata={"step": "2"})
    else:
        trace = None

    agent_cfg = _make_config()
    log_entry = _make_run_log(agent_cfg.id)
    manager = _make_manager()
    fake_executor, recorded = _make_executor(
        file_path="/emails/ProjectAlpha_info.html",
        file_content=_INFO_EMAIL,
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )

    executor_patch = patch(
        "app.core.agent_runner._make_agent_executor", return_value=fake_executor
    )
    finalize_patch = patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock)
    with executor_patch, finalize_patch:
        await run_local_agent(_USER_ID, agent_cfg, log_entry, manager)

    # Every insert into the notes table counts as a created note.
    note_creates = [
        call
        for call in recorded
        if call["action"] == "insert" and call["table"] == "notes"
    ]
    score = 1.0 if note_creates else 0.0

    if lf and trace:
        lf.score(
            trace_id=trace.id,
            name="runner.email_to_note",
            value=score,
            comment=f"note_creates={len(note_creates)}",
        )
        lf.flush()

    assert score == 1.0, f"Expected at least 1 note created, got {len(note_creates)}"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_3_email_to_timeline():
    """2.3 Email with an event date → at least one timeline insert. Score: runner.email_to_timeline."""
    lf = get_langfuse()
    if lf:
        trace = lf.trace(name="eval-runner-2.3-email-to-timeline", metadata={"step": "2"})
    else:
        trace = None

    agent_cfg = _make_config()
    log_entry = _make_run_log(agent_cfg.id)
    manager = _make_manager()
    fake_executor, recorded = _make_executor(
        file_path="/emails/ProjectAlpha_kickoff.html",
        file_content=_DATE_EMAIL,
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )

    executor_patch = patch(
        "app.core.agent_runner._make_agent_executor", return_value=fake_executor
    )
    finalize_patch = patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock)
    with executor_patch, finalize_patch:
        await run_local_agent(_USER_ID, agent_cfg, log_entry, manager)

    # Every insert into the timelines table counts as a created timeline entry.
    tl_creates = [
        call
        for call in recorded
        if call["action"] == "insert" and call["table"] == "timelines"
    ]
    score = 1.0 if tl_creates else 0.0

    if lf and trace:
        lf.score(
            trace_id=trace.id,
            name="runner.email_to_timeline",
            value=score,
            comment=f"timeline_creates={len(tl_creates)}",
        )
        lf.flush()

    assert score == 1.0, f"Expected at least 1 timeline created, got {len(tl_creates)}"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_4_project_matching_filename():
    """2.4 Filename contains 'ProjectAlpha' → LLM assigns to proj-alpha. Score: runner.project_filename."""
    lf = get_langfuse()
    trace = lf.trace(name="eval-runner-2.4-project-filename", metadata={"step": "2"}) if lf else None

    # Remove the project mention from the body so that the filename is the
    # only hint; otherwise this case cannot be told apart from content
    # matching (2.5).
    content = _ACTION_EMAIL.replace(" in Project Alpha", "")
    # Guard against the fixture wording changing and the replace becoming a no-op.
    assert "Project Alpha" not in content, "fixture body still names the project"

    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
        file_path="/emails/ProjectAlpha_report.html",
        file_content=content,
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )

    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
            patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
        await run_local_agent(_USER_ID, config, run_log, mgr)

    # Check that projectId = proj-alpha was used in any insert.
    inserts = [c for c in calls if c["action"] == "insert"]
    correct_project = any(
        c.get("data", {}).get("projectId") == "proj-alpha"
        for c in inserts
    )
    score = 1.0 if correct_project else 0.0

    if lf and trace:
        lf.score(trace_id=trace.id, name="runner.project_filename", value=score)
        lf.flush()

    assert score == 1.0, "Expected inserts to use proj-alpha based on filename"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_5_project_matching_content():
    """2.5 Body mentions 'Project Alpha' → inserts use proj-alpha. Score: runner.project_content."""
    lf = get_langfuse()
    if lf:
        trace = lf.trace(name="eval-runner-2.5-project-content", metadata={"step": "2"})
    else:
        trace = None

    agent_cfg = _make_config()
    log_entry = _make_run_log(agent_cfg.id)
    manager = _make_manager()
    fake_executor, recorded = _make_executor(
        file_path="/emails/email_001.html",  # generic filename, carries no project hint
        file_content=_ACTION_EMAIL,  # the body names "Project Alpha"
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )

    executor_patch = patch(
        "app.core.agent_runner._make_agent_executor", return_value=fake_executor
    )
    finalize_patch = patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock)
    with executor_patch, finalize_patch:
        await run_local_agent(_USER_ID, agent_cfg, log_entry, manager)

    # One insert carrying projectId == "proj-alpha" is enough to pass.
    correct_project = False
    for call in recorded:
        if call["action"] != "insert":
            continue
        if call.get("data", {}).get("projectId") == "proj-alpha":
            correct_project = True
            break
    score = 1.0 if correct_project else 0.0

    if lf and trace:
        lf.score(trace_id=trace.id, name="runner.project_content", value=score)
        lf.flush()

    assert score == 1.0, "Expected inserts to use proj-alpha based on email body content"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_6_no_project_match_global_rule():
    """2.6 Newsletter email + global rule 'no project = no entities' → no creates. Score: runner.no_project."""
    lf = get_langfuse()
    trace = lf.trace(name="eval-runner-2.6-no-project", metadata={"step": "2"}) if lf else None

    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
        file_path="/emails/newsletter.html",
        file_content=_NO_PROJECT_EMAIL,
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )

    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
            patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
        await run_local_agent(_USER_ID, config, run_log, mgr)

    _, kwargs = mock_fin.call_args
    inserts = [c for c in calls if c["action"] == "insert"]
    score = 1.0 if not inserts else 0.0

    if lf and trace:
        # Report the runner's own counter next to the observed inserts (as in
        # 2.1) so a mismatch between the two is visible in Langfuse.
        lf.score(
            trace_id=trace.id,
            name="runner.no_project",
            value=score,
            comment=f"inserts={len(inserts)} items_created={kwargs.get('items_created')}",
        )
        lf.flush()

    assert score == 1.0, f"Expected 0 inserts for unmatched newsletter, got {len(inserts)}"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_7_deduplication():
    """2.7 Existing task with same title → LLM calls update_task, not create_task. Score: runner.dedup."""
    lf = get_langfuse()
    trace = lf.trace(name="eval-runner-2.7-dedup", metadata={"step": "2"}) if lf else None

    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
        file_path="/emails/ProjectAlpha_followup.html",
        file_content=_ACTION_EMAIL,  # "Fix the login bug" — already exists
        projects=[_PROJECT_ALPHA],
        existing_tasks=[_EXISTING_TASK],  # task already exists
    )

    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
            patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
        await run_local_agent(_USER_ID, config, run_log, mgr)

    task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"]
    task_updates = [c for c in calls if c["action"] == "update" and c.get("table") == "tasks"]

    # Deduplication means the existing task was updated AND no new task was
    # inserted. With `or`, a run that creates a duplicate alongside an update
    # (or that does nothing at all) would still score 1.0.
    score = 1.0 if not task_creates and task_updates else 0.0

    if lf and trace:
        lf.score(
            trace_id=trace.id,
            name="runner.dedup",
            value=score,
            comment=f"creates={len(task_creates)} updates={len(task_updates)}",
        )
        lf.flush()

    assert score == 1.0, (
        f"Expected deduplication: creates={len(task_creates)}, updates={len(task_updates)}"
    )