feat(local-agent-v2): step 2+3 — unified runner + AgentConfig schema
Step 3 (prerequisite): - app/schemas.py: add ContentTypeConfig + AgentConfig Pydantic models - app/models.py: add agent_config (JSON, nullable) to LocalAgentConfig - alembic migration a3b9c0d1e2f3: ADD COLUMN agent_config Step 2 (runner refactor): - Remove _classify_file() and _BATCH_FILE_CLASSIFIER_PROMPT (LLM classification step) - Add Phase A: detect_content_type + preprocess (zero LLM, per file) - Add _UNIFIED_PROCESSING_PROMPT (hot-swappable via Langfuse "unified_processing") - Add helper functions: _format_projects, _format_metadata, _get_extraction_rules, _get_no_match_behavior - Single LLM call per file with tools (classify + extract + create) - Fix items_created: count create_* tool calls via _tool_calls_out param - test_agent_runner_v2.py: 10 cases (2.1-2.10) with Langfuse eval scoring Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
587
tests/test_agent_runner_v2.py
Normal file
587
tests/test_agent_runner_v2.py
Normal file
@@ -0,0 +1,587 @@
|
||||
"""Tests for Local Agent V2 runner (Step 2).
|
||||
|
||||
Covers the unified per-file flow:
|
||||
Phase A — detect + preprocess (Python, zero LLM)
|
||||
Phase B — single LLM call with tools (classify + extract + create)
|
||||
|
||||
Test cases:
|
||||
2.1 Happy path: email with action → create_task called
|
||||
2.2 Happy path: email informative → create_note called
|
||||
2.3 Happy path: email with date → create_timeline called
|
||||
2.4 Project matching via filename → correct project_id used
|
||||
2.5 Project matching via content → correct project_id used
|
||||
2.6 No project match + global rule → no create_* called
|
||||
2.7 Deduplication → update_task, not create_task
|
||||
2.8 items_created count (unit) → items_created == N create_* calls
|
||||
2.9 Device offline (unit) → status=error
|
||||
2.10 Empty file (unit) → items_processed=0, status=success
|
||||
|
||||
Run:
|
||||
pytest tests/test_agent_runner_v2.py -v
|
||||
pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8" # unit only
|
||||
pytest tests/test_agent_runner_v2.py -v -k "eval" # LLM evals only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from app.core.agent_runner import (
|
||||
_format_metadata,
|
||||
_format_projects,
|
||||
_get_extraction_rules,
|
||||
_get_no_match_behavior,
|
||||
_is_overdue,
|
||||
run_local_agent,
|
||||
)
|
||||
from app.core.device_manager import DeviceConnectionManager
|
||||
from app.core.langfuse_client import get_langfuse, get_prompt_or_fallback
|
||||
from app.models import AgentRunLog, LocalAgentConfig
|
||||
from tests.conftest import TEST_USER_IDS
|
||||
|
||||
# ── Constants ─────────────────────────────────────────────────────────────
|
||||
|
||||
_USER_ID = TEST_USER_IDS["power"]
|
||||
|
||||
_AGENT_CONFIG = {
|
||||
"content_types": [
|
||||
{
|
||||
"id": "email_html",
|
||||
"label": "Email HTML",
|
||||
"detection_hint": "HTML file with From/To/Subject headers",
|
||||
"preprocessing": "email_html",
|
||||
"extraction_prompt": (
|
||||
"If the email contains a direct action request or task assignment → create a task. "
|
||||
"If the email contains informational content, updates, or FYI → create a note. "
|
||||
"If the email mentions a specific date for a meeting or deadline → create a timeline entry."
|
||||
),
|
||||
}
|
||||
],
|
||||
"global_rules": [
|
||||
"Se il file non è riconducibile a nessun progetto, non creare alcuna entità."
|
||||
],
|
||||
"data_types": ["tasks", "notes", "timelines"],
|
||||
}
|
||||
|
||||
_PROJECT_ALPHA = {"id": "proj-alpha", "name": "Project Alpha", "status": "active"}
|
||||
_PROJECT_BETA = {"id": "proj-beta", "name": "Project Beta", "status": "active"}
|
||||
|
||||
# ── Sample email content ──────────────────────────────────────────────────
|
||||
|
||||
_ACTION_EMAIL = """\
|
||||
<html><head></head><body>
|
||||
<p><b>From:</b> boss@company.com</p>
|
||||
<p><b>To:</b> dev@company.com</p>
|
||||
<p><b>Subject:</b> Fix the login bug</p>
|
||||
<p><b>Date:</b> 2026-04-07</p>
|
||||
<p>Hi,<br>Please fix the login bug in Project Alpha by Friday. High priority!</p>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
_INFO_EMAIL = """\
|
||||
<html><head></head><body>
|
||||
<p><b>From:</b> pm@company.com</p>
|
||||
<p><b>To:</b> team@company.com</p>
|
||||
<p><b>Subject:</b> FYI: New policy for Project Alpha</p>
|
||||
<p>Just a heads-up that starting next week all code reviews must be done
|
||||
within 24 hours for Project Alpha. No action needed from you now.</p>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
_DATE_EMAIL = """\
|
||||
<html><head></head><body>
|
||||
<p><b>From:</b> pm@company.com</p>
|
||||
<p><b>Subject:</b> Project Alpha kick-off meeting</p>
|
||||
<p>The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.</p>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
_NO_PROJECT_EMAIL = """\
|
||||
<html><head></head><body>
|
||||
<p><b>From:</b> newsletter@ads.com</p>
|
||||
<p><b>Subject:</b> Weekly newsletter</p>
|
||||
<p>Check out our latest deals on electronics!</p>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
_EXISTING_TASK = {
|
||||
"id": "task-existing",
|
||||
"title": "Fix the login bug",
|
||||
"status": "todo",
|
||||
"priority": "medium",
|
||||
}
|
||||
|
||||
|
||||
# ── Test helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _make_config(
|
||||
agent_config: dict | None = None,
|
||||
directory: str = "/emails",
|
||||
device_id: str = "dev-001",
|
||||
) -> LocalAgentConfig:
|
||||
return LocalAgentConfig(
|
||||
id=str(uuid.uuid4()),
|
||||
user_id=_USER_ID,
|
||||
device_id=device_id,
|
||||
name="Test V2 Agent",
|
||||
directory_paths=[directory],
|
||||
data_types=["tasks", "notes", "timelines"],
|
||||
prompt_template="",
|
||||
agent_config=agent_config or _AGENT_CONFIG,
|
||||
file_extensions=[".html", ".eml"],
|
||||
schedule_cron="0 */6 * * *",
|
||||
enabled=True,
|
||||
last_run_at=None,
|
||||
)
|
||||
|
||||
|
||||
def _make_run_log(agent_id: str) -> AgentRunLog:
|
||||
return AgentRunLog(
|
||||
id=str(uuid.uuid4()),
|
||||
agent_id=agent_id,
|
||||
agent_type="local",
|
||||
user_id=_USER_ID,
|
||||
status="running",
|
||||
started_at=datetime.now(timezone.utc),
|
||||
)
|
||||
|
||||
|
||||
def _make_manager(online: bool = True) -> DeviceConnectionManager:
|
||||
mgr = DeviceConnectionManager()
|
||||
if online:
|
||||
ws = MagicMock()
|
||||
ws.send_text = AsyncMock()
|
||||
mgr.register(_USER_ID, "dev-001", ws)
|
||||
return mgr
|
||||
|
||||
|
||||
def _make_executor(
|
||||
file_path: str,
|
||||
file_content: str,
|
||||
projects: list[dict] | None = None,
|
||||
existing_tasks: list[dict] | None = None,
|
||||
existing_notes: list[dict] | None = None,
|
||||
existing_timelines: list[dict] | None = None,
|
||||
) -> tuple[Any, list[dict]]:
|
||||
"""Return (async_executor, captured_calls).
|
||||
|
||||
The executor handles all ``execute_on_client`` payloads:
|
||||
directory listing, file reading, project/entity fetching, and CRUD.
|
||||
"""
|
||||
calls: list[dict] = []
|
||||
_projects = projects or [_PROJECT_ALPHA, _PROJECT_BETA]
|
||||
|
||||
async def _executor(payload: dict) -> dict:
|
||||
action = payload.get("action", "")
|
||||
table = payload.get("table", "")
|
||||
data = payload.get("data") or {}
|
||||
calls.append({"action": action, "table": table, "data": data})
|
||||
|
||||
if action == "list_directory":
|
||||
path = data.get("path", "") or payload.get("data", {}).get("path", "")
|
||||
return {
|
||||
"entries": [{"type": "file", "path": file_path}]
|
||||
}
|
||||
|
||||
if action == "get_file_metadata":
|
||||
return {"modifiedAt": None}
|
||||
|
||||
if action == "read_file_content":
|
||||
return {"content": file_content}
|
||||
|
||||
if action == "select":
|
||||
if table == "projects":
|
||||
return {"rows": _projects}
|
||||
if table == "tasks":
|
||||
return {"rows": existing_tasks or []}
|
||||
if table == "notes":
|
||||
return {"rows": existing_notes or []}
|
||||
if table == "timelines":
|
||||
return {"rows": existing_timelines or []}
|
||||
return {"rows": []}
|
||||
|
||||
if action == "insert":
|
||||
return {"row": {"id": str(uuid.uuid4()), **data}}
|
||||
|
||||
if action == "update":
|
||||
return {"success": True}
|
||||
|
||||
return {}
|
||||
|
||||
return _executor, calls
|
||||
|
||||
|
||||
# ── Unit: helper functions ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_format_projects_empty():
|
||||
assert "(no projects" in _format_projects([])
|
||||
|
||||
|
||||
def test_format_projects_with_data():
|
||||
result = _format_projects([_PROJECT_ALPHA])
|
||||
assert "proj-alpha" in result
|
||||
assert "Project Alpha" in result
|
||||
|
||||
|
||||
def test_format_metadata_empty():
|
||||
assert _format_metadata({}) == ""
|
||||
|
||||
|
||||
def test_format_metadata_email():
|
||||
meta = {"subject": "Fix bug", "from": "boss@co.com", "date": "2026-04-07"}
|
||||
result = _format_metadata(meta)
|
||||
assert "Fix bug" in result
|
||||
assert "boss@co.com" in result
|
||||
|
||||
|
||||
def test_get_extraction_rules_match():
|
||||
rules = _get_extraction_rules(_AGENT_CONFIG, "email_html")
|
||||
assert "task" in rules.lower()
|
||||
|
||||
|
||||
def test_get_extraction_rules_fallback():
|
||||
rules = _get_extraction_rules(_AGENT_CONFIG, "plain_text")
|
||||
assert "extract" in rules.lower()
|
||||
|
||||
|
||||
def test_get_no_match_behavior_from_global_rules():
|
||||
behavior = _get_no_match_behavior(_AGENT_CONFIG)
|
||||
# The global rule says "non creare alcuna entità" → skip behavior
|
||||
assert behavior # non-empty
|
||||
|
||||
|
||||
def test_get_no_match_behavior_default():
|
||||
behavior = _get_no_match_behavior({})
|
||||
assert "project" in behavior.lower()
|
||||
|
||||
|
||||
# ── Unit: 2.9 — device offline ───────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_2_9_device_offline():
|
||||
"""2.9 No device online → status=error, no executor created."""
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager(online=False)
|
||||
|
||||
with patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
_, kwargs = mock_fin.call_args
|
||||
assert kwargs["status"] == "error"
|
||||
assert any("not connected" in e for e in kwargs.get("errors", []))
|
||||
|
||||
|
||||
# ── Unit: 2.10 — empty file ──────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_2_10_empty_file():
|
||||
"""2.10 File with empty content → skipped, items_processed=0, success."""
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager()
|
||||
|
||||
executor, calls = _make_executor(
|
||||
file_path="/emails/empty.html",
|
||||
file_content="", # empty
|
||||
projects=[_PROJECT_ALPHA],
|
||||
)
|
||||
|
||||
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
|
||||
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
_, kwargs = mock_fin.call_args
|
||||
assert kwargs["items_processed"] == 0
|
||||
assert kwargs["status"] == "success"
|
||||
assert kwargs["items_created"] == 0
|
||||
|
||||
|
||||
# ── Unit: 2.8 — items_created count ─────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_2_8_items_created_count():
|
||||
"""2.8 items_created == number of create_* tool calls per run."""
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager()
|
||||
|
||||
executor, _calls = _make_executor(
|
||||
file_path="/emails/action.html",
|
||||
file_content=_ACTION_EMAIL,
|
||||
projects=[_PROJECT_ALPHA],
|
||||
)
|
||||
|
||||
# Simulate LLM calling create_task twice and update_note once.
|
||||
async def mock_run_agent(*, _tool_calls_out=None, **kw) -> str:
|
||||
if _tool_calls_out is not None:
|
||||
_tool_calls_out.extend(["create_task", "create_note", "update_task"])
|
||||
return "Done."
|
||||
|
||||
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
|
||||
patch("app.core.agent_runner._run_agent_with_tools", side_effect=mock_run_agent), \
|
||||
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
_, kwargs = mock_fin.call_args
|
||||
# Only create_task + create_note count (not update_task).
|
||||
assert kwargs["items_created"] == 2
|
||||
assert kwargs["items_processed"] == 1
|
||||
|
||||
|
||||
# ── Eval: 2.1–2.7 (real LLM + Langfuse scoring) ──────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.eval
|
||||
async def test_2_1_email_to_task():
|
||||
"""2.1 Action email → LLM calls create_task. Score: runner.email_to_task."""
|
||||
lf = get_langfuse()
|
||||
trace = lf.trace(
|
||||
name="eval-runner-2.1-email-to-task",
|
||||
metadata={"step": "2"},
|
||||
) if lf else None
|
||||
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager()
|
||||
|
||||
executor, calls = _make_executor(
|
||||
file_path="/emails/ProjectAlpha_action.html",
|
||||
file_content=_ACTION_EMAIL,
|
||||
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
|
||||
)
|
||||
|
||||
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
|
||||
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
_, kwargs = mock_fin.call_args
|
||||
task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"]
|
||||
score = 1.0 if len(task_creates) >= 1 else 0.0
|
||||
|
||||
if lf and trace:
|
||||
lf.score(
|
||||
trace_id=trace.id,
|
||||
name="runner.email_to_task",
|
||||
value=score,
|
||||
comment=f"task_creates={len(task_creates)} items_created={kwargs.get('items_created')}",
|
||||
)
|
||||
lf.flush()
|
||||
|
||||
assert score == 1.0, f"Expected at least 1 task created, got {len(task_creates)}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.eval
|
||||
async def test_2_2_email_to_note():
|
||||
"""2.2 Informational email → LLM calls create_note. Score: runner.email_to_note."""
|
||||
lf = get_langfuse()
|
||||
trace = lf.trace(name="eval-runner-2.2-email-to-note", metadata={"step": "2"}) if lf else None
|
||||
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager()
|
||||
|
||||
executor, calls = _make_executor(
|
||||
file_path="/emails/ProjectAlpha_info.html",
|
||||
file_content=_INFO_EMAIL,
|
||||
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
|
||||
)
|
||||
|
||||
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
|
||||
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
note_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "notes"]
|
||||
score = 1.0 if len(note_creates) >= 1 else 0.0
|
||||
|
||||
if lf and trace:
|
||||
lf.score(trace_id=trace.id, name="runner.email_to_note", value=score,
|
||||
comment=f"note_creates={len(note_creates)}")
|
||||
lf.flush()
|
||||
|
||||
assert score == 1.0, f"Expected at least 1 note created, got {len(note_creates)}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.eval
|
||||
async def test_2_3_email_to_timeline():
|
||||
"""2.3 Email with event date → LLM calls create_timeline. Score: runner.email_to_timeline."""
|
||||
lf = get_langfuse()
|
||||
trace = lf.trace(name="eval-runner-2.3-email-to-timeline", metadata={"step": "2"}) if lf else None
|
||||
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager()
|
||||
|
||||
executor, calls = _make_executor(
|
||||
file_path="/emails/ProjectAlpha_kickoff.html",
|
||||
file_content=_DATE_EMAIL,
|
||||
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
|
||||
)
|
||||
|
||||
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
|
||||
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
tl_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "timelines"]
|
||||
score = 1.0 if len(tl_creates) >= 1 else 0.0
|
||||
|
||||
if lf and trace:
|
||||
lf.score(trace_id=trace.id, name="runner.email_to_timeline", value=score,
|
||||
comment=f"timeline_creates={len(tl_creates)}")
|
||||
lf.flush()
|
||||
|
||||
assert score == 1.0, f"Expected at least 1 timeline created, got {len(tl_creates)}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.eval
|
||||
async def test_2_4_project_matching_filename():
|
||||
"""2.4 Filename contains 'ProjectAlpha' → LLM assigns to proj-alpha. Score: runner.project_filename."""
|
||||
lf = get_langfuse()
|
||||
trace = lf.trace(name="eval-runner-2.4-project-filename", metadata={"step": "2"}) if lf else None
|
||||
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager()
|
||||
|
||||
executor, calls = _make_executor(
|
||||
file_path="/emails/ProjectAlpha_report.html",
|
||||
file_content=_ACTION_EMAIL,
|
||||
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
|
||||
)
|
||||
|
||||
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
|
||||
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
# Check that project_id = proj-alpha was used in any insert
|
||||
inserts = [c for c in calls if c["action"] == "insert"]
|
||||
correct_project = any(
|
||||
c.get("data", {}).get("projectId") == "proj-alpha"
|
||||
for c in inserts
|
||||
)
|
||||
score = 1.0 if correct_project else 0.0
|
||||
|
||||
if lf and trace:
|
||||
lf.score(trace_id=trace.id, name="runner.project_filename", value=score)
|
||||
lf.flush()
|
||||
|
||||
assert score == 1.0, "Expected inserts to use proj-alpha based on filename"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.eval
|
||||
async def test_2_5_project_matching_content():
|
||||
"""2.5 Email body mentions 'Project Alpha' → correct project assigned. Score: runner.project_content."""
|
||||
lf = get_langfuse()
|
||||
trace = lf.trace(name="eval-runner-2.5-project-content", metadata={"step": "2"}) if lf else None
|
||||
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager()
|
||||
|
||||
executor, calls = _make_executor(
|
||||
file_path="/emails/email_001.html", # generic filename, no project hint
|
||||
file_content=_ACTION_EMAIL, # body mentions "Project Alpha"
|
||||
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
|
||||
)
|
||||
|
||||
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
|
||||
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
inserts = [c for c in calls if c["action"] == "insert"]
|
||||
correct_project = any(
|
||||
c.get("data", {}).get("projectId") == "proj-alpha"
|
||||
for c in inserts
|
||||
)
|
||||
score = 1.0 if correct_project else 0.0
|
||||
|
||||
if lf and trace:
|
||||
lf.score(trace_id=trace.id, name="runner.project_content", value=score)
|
||||
lf.flush()
|
||||
|
||||
assert score == 1.0, "Expected inserts to use proj-alpha based on email body content"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.eval
|
||||
async def test_2_6_no_project_match_global_rule():
|
||||
"""2.6 Newsletter email + global rule 'no project = no entities' → no creates. Score: runner.no_project."""
|
||||
lf = get_langfuse()
|
||||
trace = lf.trace(name="eval-runner-2.6-no-project", metadata={"step": "2"}) if lf else None
|
||||
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager()
|
||||
|
||||
executor, calls = _make_executor(
|
||||
file_path="/emails/newsletter.html",
|
||||
file_content=_NO_PROJECT_EMAIL,
|
||||
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
|
||||
)
|
||||
|
||||
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
|
||||
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin:
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
_, kwargs = mock_fin.call_args
|
||||
inserts = [c for c in calls if c["action"] == "insert"]
|
||||
score = 1.0 if len(inserts) == 0 else 0.0
|
||||
|
||||
if lf and trace:
|
||||
lf.score(trace_id=trace.id, name="runner.no_project", value=score,
|
||||
comment=f"inserts={len(inserts)}")
|
||||
lf.flush()
|
||||
|
||||
assert score == 1.0, f"Expected 0 inserts for unmatched newsletter, got {len(inserts)}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.eval
|
||||
async def test_2_7_deduplication():
|
||||
"""2.7 Existing task with same title → LLM calls update_task, not create_task. Score: runner.dedup."""
|
||||
lf = get_langfuse()
|
||||
trace = lf.trace(name="eval-runner-2.7-dedup", metadata={"step": "2"}) if lf else None
|
||||
|
||||
config = _make_config()
|
||||
run_log = _make_run_log(config.id)
|
||||
mgr = _make_manager()
|
||||
|
||||
executor, calls = _make_executor(
|
||||
file_path="/emails/ProjectAlpha_followup.html",
|
||||
file_content=_ACTION_EMAIL, # "Fix the login bug" — already exists
|
||||
projects=[_PROJECT_ALPHA],
|
||||
existing_tasks=[_EXISTING_TASK], # task already exists
|
||||
)
|
||||
|
||||
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
|
||||
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
|
||||
await run_local_agent(_USER_ID, config, run_log, mgr)
|
||||
|
||||
task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"]
|
||||
task_updates = [c for c in calls if c["action"] == "update" and c.get("table") == "tasks"]
|
||||
# Prefer update over create
|
||||
score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0
|
||||
|
||||
if lf and trace:
|
||||
lf.score(trace_id=trace.id, name="runner.dedup", value=score,
|
||||
comment=f"creates={len(task_creates)} updates={len(task_updates)}")
|
||||
lf.flush()
|
||||
|
||||
assert score == 1.0, (
|
||||
f"Expected deduplication: creates={len(task_creates)}, updates={len(task_updates)}"
|
||||
)
|
||||
Reference in New Issue
Block a user