feat(local-agent-v2): step 2+3 — unified runner + AgentConfig schema

Step 3 (prerequisite): - app/schemas.py: add ContentTypeConfig + AgentConfig Pydantic models - app/models.py: add agent_config (JSON, nullable) to LocalAgentConfig - alembic migration a3b9c0d1e2f3: ADD COLUMN agent_config Step 2 (runner refactor): - Remove _classify_file() and _BATCH_FILE_CLASSIFIER_PROMPT (LLM classification step) - Add Phase A: detect_content_type + preprocess (zero LLM, per file) - Add _UNIFIED_PROCESSING_PROMPT (hot-swappable via Langfuse "unified_processing") - Add helper functions: _format_projects, _format_metadata, _get_extraction_rules, _get_no_match_behavior - Single LLM call per file with tools (classify + extract + create) - Fix items_created: count create_* tool calls via _tool_calls_out param - test_agent_runner_v2.py: 10 cases (2.1-2.10) with Langfuse eval scoring Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 15:00:32 +02:00
parent d91c98f86d
commit fa231a3642
5 changed files with 796 additions and 260 deletions
--- a/tests/test_agent_runner_v2.py
+++ b/tests/test_agent_runner_v2.py
@@ -0,0 +1,587 @@
+"""Tests for Local Agent V2 runner (Step 2).
+
+Covers the unified per-file flow:
+  Phase A — detect + preprocess (Python, zero LLM)
+  Phase B — single LLM call with tools (classify + extract + create)
+
+Test cases:
+  2.1  Happy path: email with action    → create_task called
+  2.2  Happy path: email informative    → create_note called
+  2.3  Happy path: email with date      → create_timeline called
+  2.4  Project matching via filename    → correct project_id used
+  2.5  Project matching via content     → correct project_id used
+  2.6  No project match + global rule   → no create_* called
+  2.7  Deduplication                    → update_task, not create_task
+  2.8  items_created count (unit)       → items_created == N create_* calls
+  2.9  Device offline (unit)            → status=error
+  2.10 Empty file (unit)                → items_processed=0, status=success
+
+Run:
+    pytest tests/test_agent_runner_v2.py -v
+    pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8"   # unit only
+    pytest tests/test_agent_runner_v2.py -v -k "eval"                  # LLM evals only
+"""
+
+from __future__ import annotations
+
+import uuid
+from datetime import datetime, timezone
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from app.core.agent_runner import (
+    _format_metadata,
+    _format_projects,
+    _get_extraction_rules,
+    _get_no_match_behavior,
+    _is_overdue,
+    run_local_agent,
+)
+from app.core.device_manager import DeviceConnectionManager
+from app.core.langfuse_client import get_langfuse, get_prompt_or_fallback
+from app.models import AgentRunLog, LocalAgentConfig
+from tests.conftest import TEST_USER_IDS
+
+# ── Constants ─────────────────────────────────────────────────────────────
+
+# User id that every fixture in this module is created for.
+_USER_ID = TEST_USER_IDS["power"]
+
+# Default agent configuration used by _make_config when none is supplied:
+# a single "email_html" content type, one global rule and three data types.
+_AGENT_CONFIG = {
+    "content_types": [
+        {
+            "id": "email_html",
+            "label": "Email HTML",
+            "detection_hint": "HTML file with From/To/Subject headers",
+            "preprocessing": "email_html",
+            "extraction_prompt": (
+                "If the email contains a direct action request or task assignment → create a task. "
+                "If the email contains informational content, updates, or FYI → create a note. "
+                "If the email mentions a specific date for a meeting or deadline → create a timeline entry."
+            ),
+        }
+    ],
+    "global_rules": [
+        # Italian rule text; in English: "If the file cannot be traced back to
+        # any project, do not create any entity."
+        "Se il file non è riconducibile a nessun progetto, non creare alcuna entità."
+    ],
+    "data_types": ["tasks", "notes", "timelines"],
+}
+
+# Project rows the fake executor returns for a `select` on the "projects" table.
+_PROJECT_ALPHA = {"id": "proj-alpha", "name": "Project Alpha", "status": "active"}
+_PROJECT_BETA  = {"id": "proj-beta",  "name": "Project Beta",  "status": "active"}
+
+# ── Sample email content ──────────────────────────────────────────────────
+
+# Email that asks for a concrete action on Project Alpha with a deadline.
+# Used by tests 2.1, 2.4, 2.5, 2.7 and 2.8.
+_ACTION_EMAIL = """\
+<html><head></head><body>
+<p><b>From:</b> boss@company.com</p>
+<p><b>To:</b> dev@company.com</p>
+<p><b>Subject:</b> Fix the login bug</p>
+<p><b>Date:</b> 2026-04-07</p>
+<p>Hi,<br>Please fix the login bug in Project Alpha by Friday. High priority!</p>
+</body></html>
+"""
+
+# Purely informational email about Project Alpha ("No action needed"). Used by test 2.2.
+_INFO_EMAIL = """\
+<html><head></head><body>
+<p><b>From:</b> pm@company.com</p>
+<p><b>To:</b> team@company.com</p>
+<p><b>Subject:</b> FYI: New policy for Project Alpha</p>
+<p>Just a heads-up that starting next week all code reviews must be done
+within 24 hours for Project Alpha. No action needed from you now.</p>
+</body></html>
+"""
+
+# Email announcing a meeting on a specific date and time. Used by test 2.3.
+_DATE_EMAIL = """\
+<html><head></head><body>
+<p><b>From:</b> pm@company.com</p>
+<p><b>Subject:</b> Project Alpha kick-off meeting</p>
+<p>The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.</p>
+</body></html>
+"""
+
+# Newsletter that mentions neither Project Alpha nor Project Beta. Used by test 2.6.
+_NO_PROJECT_EMAIL = """\
+<html><head></head><body>
+<p><b>From:</b> newsletter@ads.com</p>
+<p><b>Subject:</b> Weekly newsletter</p>
+<p>Check out our latest deals on electronics!</p>
+</body></html>
+"""
+
+# Pre-existing task row whose title equals the subject of _ACTION_EMAIL.
+# Used by test 2.7 (deduplication).
+_EXISTING_TASK = {
+    "id": "task-existing",
+    "title": "Fix the login bug",
+    "status": "todo",
+    "priority": "medium",
+}
+
+
+# ── Test helpers ──────────────────────────────────────────────────────────
+
+
+def _make_config(
+    agent_config: dict | None = None,
+    directory: str = "/emails",
+    device_id: str = "dev-001",
+) -> LocalAgentConfig:
+    return LocalAgentConfig(
+        id=str(uuid.uuid4()),
+        user_id=_USER_ID,
+        device_id=device_id,
+        name="Test V2 Agent",
+        directory_paths=[directory],
+        data_types=["tasks", "notes", "timelines"],
+        prompt_template="",
+        agent_config=agent_config or _AGENT_CONFIG,
+        file_extensions=[".html", ".eml"],
+        schedule_cron="0 */6 * * *",
+        enabled=True,
+        last_run_at=None,
+    )
+
+
+def _make_run_log(agent_id: str) -> AgentRunLog:
+    return AgentRunLog(
+        id=str(uuid.uuid4()),
+        agent_id=agent_id,
+        agent_type="local",
+        user_id=_USER_ID,
+        status="running",
+        started_at=datetime.now(timezone.utc),
+    )
+
+
+def _make_manager(online: bool = True) -> DeviceConnectionManager:
+    mgr = DeviceConnectionManager()
+    if online:
+        ws = MagicMock()
+        ws.send_text = AsyncMock()
+        mgr.register(_USER_ID, "dev-001", ws)
+    return mgr
+
+
+def _make_executor(
+    file_path: str,
+    file_content: str,
+    projects: list[dict] | None = None,
+    existing_tasks: list[dict] | None = None,
+    existing_notes: list[dict] | None = None,
+    existing_timelines: list[dict] | None = None,
+) -> tuple[Any, list[dict]]:
+    """Return (async_executor, captured_calls).
+
+    The executor handles all ``execute_on_client`` payloads:
+    directory listing, file reading, project/entity fetching, and CRUD.
+    """
+    calls: list[dict] = []
+    _projects = projects or [_PROJECT_ALPHA, _PROJECT_BETA]
+
+    async def _executor(payload: dict) -> dict:
+        action = payload.get("action", "")
+        table = payload.get("table", "")
+        data = payload.get("data") or {}
+        calls.append({"action": action, "table": table, "data": data})
+
+        if action == "list_directory":
+            path = data.get("path", "") or payload.get("data", {}).get("path", "")
+            return {
+                "entries": [{"type": "file", "path": file_path}]
+            }
+
+        if action == "get_file_metadata":
+            return {"modifiedAt": None}
+
+        if action == "read_file_content":
+            return {"content": file_content}
+
+        if action == "select":
+            if table == "projects":
+                return {"rows": _projects}
+            if table == "tasks":
+                return {"rows": existing_tasks or []}
+            if table == "notes":
+                return {"rows": existing_notes or []}
+            if table == "timelines":
+                return {"rows": existing_timelines or []}
+            return {"rows": []}
+
+        if action == "insert":
+            return {"row": {"id": str(uuid.uuid4()), **data}}
+
+        if action == "update":
+            return {"success": True}
+
+        return {}
+
+    return _executor, calls
+
+
+# ── Unit: helper functions ────────────────────────────────────────────────
+
+
+def test_format_projects_empty():
+    assert "(no projects" in _format_projects([])
+
+
+def test_format_projects_with_data():
+    result = _format_projects([_PROJECT_ALPHA])
+    assert "proj-alpha" in result
+    assert "Project Alpha" in result
+
+
+def test_format_metadata_empty():
+    assert _format_metadata({}) == ""
+
+
+def test_format_metadata_email():
+    meta = {"subject": "Fix bug", "from": "boss@co.com", "date": "2026-04-07"}
+    result = _format_metadata(meta)
+    assert "Fix bug" in result
+    assert "boss@co.com" in result
+
+
+def test_get_extraction_rules_match():
+    rules = _get_extraction_rules(_AGENT_CONFIG, "email_html")
+    assert "task" in rules.lower()
+
+
+def test_get_extraction_rules_fallback():
+    rules = _get_extraction_rules(_AGENT_CONFIG, "plain_text")
+    assert "extract" in rules.lower()
+
+
+def test_get_no_match_behavior_from_global_rules():
+    behavior = _get_no_match_behavior(_AGENT_CONFIG)
+    # The global rule says "non creare alcuna entità" → skip behavior
+    assert behavior  # non-empty
+
+
+def test_get_no_match_behavior_default():
+    behavior = _get_no_match_behavior({})
+    assert "project" in behavior.lower()
+
+
+# ── Unit: 2.9 — device offline ───────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_2_9_device_offline():
+    """2.9 No device online → status=error, no executor created."""
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager(online=False)
+
+    with patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    _, kwargs = mock_fin.call_args
+    assert kwargs["status"] == "error"
+    assert any("not connected" in e for e in kwargs.get("errors", []))
+
+
+# ── Unit: 2.10 — empty file ──────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_2_10_empty_file():
+    """2.10 File with empty content → skipped, items_processed=0, success."""
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager()
+
+    executor, calls = _make_executor(
+        file_path="/emails/empty.html",
+        file_content="",  # empty
+        projects=[_PROJECT_ALPHA],
+    )
+
+    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
+         patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    _, kwargs = mock_fin.call_args
+    assert kwargs["items_processed"] == 0
+    assert kwargs["status"] == "success"
+    assert kwargs["items_created"] == 0
+
+
+# ── Unit: 2.8 — items_created count ─────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_2_8_items_created_count():
+    """2.8 items_created == number of create_* tool calls per run.
+
+    The LLM step is replaced by a stub that reports three tool calls, two of
+    which are create_* calls; the run must be finalized with items_created == 2
+    and items_processed == 1.
+    """
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager()
+
+    executor, _calls = _make_executor(
+        file_path="/emails/action.html",
+        file_content=_ACTION_EMAIL,
+        projects=[_PROJECT_ALPHA],
+    )
+
+    # Simulate the LLM calling create_task once, create_note once and update_task once.
+    async def mock_run_agent(*, _tool_calls_out=None, **kw) -> str:
+        if _tool_calls_out is not None:
+            _tool_calls_out.extend(["create_task", "create_note", "update_task"])
+        return "Done."
+
+    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
+         patch("app.core.agent_runner._run_agent_with_tools", side_effect=mock_run_agent), \
+         patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    _, kwargs = mock_fin.call_args
+    # Only create_task + create_note count (not update_task).
+    assert kwargs["items_created"] == 2
+    assert kwargs["items_processed"] == 1
+
+
+# ── Eval: 2.1–2.7 (real LLM + Langfuse scoring) ──────────────────────────
+
+
+@pytest.mark.asyncio
+@pytest.mark.eval
+async def test_2_1_email_to_task():
+    """2.1 Action email → LLM calls create_task. Score: runner.email_to_task."""
+    lf = get_langfuse()
+    trace = lf.trace(
+        name="eval-runner-2.1-email-to-task",
+        metadata={"step": "2"},
+    ) if lf else None
+
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager()
+
+    executor, calls = _make_executor(
+        file_path="/emails/ProjectAlpha_action.html",
+        file_content=_ACTION_EMAIL,
+        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
+    )
+
+    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
+         patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    _, kwargs = mock_fin.call_args
+    task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"]
+    score = 1.0 if len(task_creates) >= 1 else 0.0
+
+    if lf and trace:
+        lf.score(
+            trace_id=trace.id,
+            name="runner.email_to_task",
+            value=score,
+            comment=f"task_creates={len(task_creates)} items_created={kwargs.get('items_created')}",
+        )
+        lf.flush()
+
+    assert score == 1.0, f"Expected at least 1 task created, got {len(task_creates)}"
+
+
+@pytest.mark.asyncio
+@pytest.mark.eval
+async def test_2_2_email_to_note():
+    """2.2 Informational email → LLM calls create_note. Score: runner.email_to_note."""
+    lf = get_langfuse()
+    trace = lf.trace(name="eval-runner-2.2-email-to-note", metadata={"step": "2"}) if lf else None
+
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager()
+
+    executor, calls = _make_executor(
+        file_path="/emails/ProjectAlpha_info.html",
+        file_content=_INFO_EMAIL,
+        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
+    )
+
+    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
+         patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    note_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "notes"]
+    score = 1.0 if len(note_creates) >= 1 else 0.0
+
+    if lf and trace:
+        lf.score(trace_id=trace.id, name="runner.email_to_note", value=score,
+                 comment=f"note_creates={len(note_creates)}")
+        lf.flush()
+
+    assert score == 1.0, f"Expected at least 1 note created, got {len(note_creates)}"
+
+
+@pytest.mark.asyncio
+@pytest.mark.eval
+async def test_2_3_email_to_timeline():
+    """2.3 Email with event date → LLM calls create_timeline. Score: runner.email_to_timeline."""
+    lf = get_langfuse()
+    trace = lf.trace(name="eval-runner-2.3-email-to-timeline", metadata={"step": "2"}) if lf else None
+
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager()
+
+    executor, calls = _make_executor(
+        file_path="/emails/ProjectAlpha_kickoff.html",
+        file_content=_DATE_EMAIL,
+        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
+    )
+
+    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
+         patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    tl_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "timelines"]
+    score = 1.0 if len(tl_creates) >= 1 else 0.0
+
+    if lf and trace:
+        lf.score(trace_id=trace.id, name="runner.email_to_timeline", value=score,
+                 comment=f"timeline_creates={len(tl_creates)}")
+        lf.flush()
+
+    assert score == 1.0, f"Expected at least 1 timeline created, got {len(tl_creates)}"
+
+
+@pytest.mark.asyncio
+@pytest.mark.eval
+async def test_2_4_project_matching_filename():
+    """2.4 Filename contains 'ProjectAlpha' → LLM assigns to proj-alpha. Score: runner.project_filename."""
+    lf = get_langfuse()
+    trace = lf.trace(name="eval-runner-2.4-project-filename", metadata={"step": "2"}) if lf else None
+
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager()
+
+    executor, calls = _make_executor(
+        file_path="/emails/ProjectAlpha_report.html",
+        file_content=_ACTION_EMAIL,
+        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
+    )
+
+    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
+         patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    # Check that project_id = proj-alpha was used in any insert
+    inserts = [c for c in calls if c["action"] == "insert"]
+    correct_project = any(
+        c.get("data", {}).get("projectId") == "proj-alpha"
+        for c in inserts
+    )
+    score = 1.0 if correct_project else 0.0
+
+    if lf and trace:
+        lf.score(trace_id=trace.id, name="runner.project_filename", value=score)
+        lf.flush()
+
+    assert score == 1.0, "Expected inserts to use proj-alpha based on filename"
+
+
+@pytest.mark.asyncio
+@pytest.mark.eval
+async def test_2_5_project_matching_content():
+    """2.5 Email body mentions 'Project Alpha' → correct project assigned. Score: runner.project_content."""
+    lf = get_langfuse()
+    trace = lf.trace(name="eval-runner-2.5-project-content", metadata={"step": "2"}) if lf else None
+
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager()
+
+    executor, calls = _make_executor(
+        file_path="/emails/email_001.html",  # generic filename, no project hint
+        file_content=_ACTION_EMAIL,  # body mentions "Project Alpha"
+        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
+    )
+
+    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
+         patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    inserts = [c for c in calls if c["action"] == "insert"]
+    correct_project = any(
+        c.get("data", {}).get("projectId") == "proj-alpha"
+        for c in inserts
+    )
+    score = 1.0 if correct_project else 0.0
+
+    if lf and trace:
+        lf.score(trace_id=trace.id, name="runner.project_content", value=score)
+        lf.flush()
+
+    assert score == 1.0, "Expected inserts to use proj-alpha based on email body content"
+
+
+@pytest.mark.asyncio
+@pytest.mark.eval
+async def test_2_6_no_project_match_global_rule():
+    """2.6 Newsletter email + global rule 'no project = no entities' → no creates. Score: runner.no_project."""
+    lf = get_langfuse()
+    trace = lf.trace(name="eval-runner-2.6-no-project", metadata={"step": "2"}) if lf else None
+
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager()
+
+    executor, calls = _make_executor(
+        file_path="/emails/newsletter.html",
+        file_content=_NO_PROJECT_EMAIL,
+        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
+    )
+
+    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
+         patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    _, kwargs = mock_fin.call_args
+    inserts = [c for c in calls if c["action"] == "insert"]
+    score = 1.0 if len(inserts) == 0 else 0.0
+
+    if lf and trace:
+        lf.score(trace_id=trace.id, name="runner.no_project", value=score,
+                 comment=f"inserts={len(inserts)}")
+        lf.flush()
+
+    assert score == 1.0, f"Expected 0 inserts for unmatched newsletter, got {len(inserts)}"
+
+
+@pytest.mark.asyncio
+@pytest.mark.eval
+async def test_2_7_deduplication():
+    """2.7 Existing task with same title → LLM calls update_task, not create_task. Score: runner.dedup."""
+    lf = get_langfuse()
+    trace = lf.trace(name="eval-runner-2.7-dedup", metadata={"step": "2"}) if lf else None
+
+    config = _make_config()
+    run_log = _make_run_log(config.id)
+    mgr = _make_manager()
+
+    executor, calls = _make_executor(
+        file_path="/emails/ProjectAlpha_followup.html",
+        file_content=_ACTION_EMAIL,  # "Fix the login bug" — already exists
+        projects=[_PROJECT_ALPHA],
+        existing_tasks=[_EXISTING_TASK],  # task already exists
+    )
+
+    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
+         patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
+        await run_local_agent(_USER_ID, config, run_log, mgr)
+
+    task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"]
+    task_updates = [c for c in calls if c["action"] == "update" and c.get("table") == "tasks"]
+    # Prefer update over create
+    score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0
+
+    if lf and trace:
+        lf.score(trace_id=trace.id, name="runner.dedup", value=score,
+                 comment=f"creates={len(task_creates)} updates={len(task_updates)}")
+        lf.flush()
+
+    assert score == 1.0, (
+        f"Expected deduplication: creates={len(task_creates)}, updates={len(task_updates)}"
+    )