@@ -4,32 +4,36 @@ Covers the unified per-file flow:
Phase A — detect + preprocess (Python, zero LLM)
Phase B — single LLM call with tools (classify + extract + create)
Test cases:
2.1 Happy path: email with action → create_task called
2.2 Happy path: email informative → create_note called
2.3 Happy path: email with date → create_timeline called
2.4 Project matching via filename → correct project_id used
2.5 Project matching via content → correct project_id used
2.6 No project match + global rule → no create_* called
2.7 Deduplication → update_task, not create_task
2.8 items_created count (unit) → items_created == N create_* calls
2.9 Device offline (unit) → status=error
2.10 Empty file (unit) → items_processed=0, status=success
Fixture-based eval tests (2.1–2.7)
-----------------------------------
Cases are defined in tests/fixtures/agent_runner_v2/cases.yaml.
Email HTML files live in tests/fixtures/agent_runner_v2/data/.
Use --runner-dir to point at a custom folder (same structure required).
Unit tests (no LLM)
--------------------
2.8 items_created count → items_created == N create_* calls
2.9 Device offline → status=error
2.10 Empty file → items_processed=0, status=success
Run:
pytest tests/test_agent_runner_v2.py -v
pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8"  # unit only
pytest tests/test_agent_runner_v2.py -v -k "eval"  # LLM evals only
pytest tests/test_agent_runner_v2.py -v --runner-dir /path/to/dir # custom fixtures
"""
from __future__ import annotations
import uuid
from contextlib import nullcontext
from datetime import datetime , timezone
from pathlib import Path
from typing import Any
from unittest . mock import AsyncMock , MagicMock , patch
import pytest
import yaml
from app . core . agent_runner import (
_format_metadata ,
@@ -40,7 +44,7 @@ from app.core.agent_runner import (
run_local_agent ,
)
from app . core . device_manager import DeviceConnectionManager
from app . core . langfuse_client import get_langfuse , get_prompt_or_fallback
from app . core . langfuse_client import get_langfuse
from app . models import AgentRunLog , LocalAgentConfig
from tests . conftest import TEST_USER_IDS
@@ -48,6 +52,8 @@ from tests.conftest import TEST_USER_IDS
_USER_ID = TEST_USER_IDS [ " power " ]
_DEFAULT_FIXTURE_DIR = Path ( __file__ ) . parent / " fixtures " / " agent_runner_v2 "
_AGENT_CONFIG = {
" content_types " : [
{
@@ -68,55 +74,53 @@ _AGENT_CONFIG = {
" data_types " : [ " tasks " , " notes " , " timelines " ] ,
}
_PROJECT_ALPHA = { " id " : " proj-alpha " , " name " : " Project Alpha " , " status " : " active " }
_PROJECT_BETA = { " id " : " proj-beta " , " name " : " Project Beta " , " status " : " active " }
# ── Sample email content ──────────────────────────────────────────────────
_ACTION_EMAIL = """ \
<html><head></head><body>
<p><b>From:</b> boss@company.com</p>
<p><b>To:</b> dev@company.com</p>
<p><b>Subject:</b> Fix the login bug</p>
<p><b>Date:</b> 2026-04-07</p>
<p>Hi,<br>Please fix the login bug in Project Alpha by Friday. High priority!</p>
</body></html>
"""
_INFO_EMAIL = """ \
<html><head></head><body>
<p><b>From:</b> pm@company.com</p>
<p><b>To:</b> team@company.com</p>
<p><b>Subject:</b> FYI: New policy for Project Alpha</p>
<p>Just a heads-up that starting next week all code reviews must be done
within 24 hours for Project Alpha. No action needed from you now.</p>
</body></html>
"""
_DATE_EMAIL = """ \
<html><head></head><body>
<p><b>From:</b> pm@company.com</p>
<p><b>Subject:</b> Project Alpha kick-off meeting</p>
<p>The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.</p>
</body></html>
"""
_NO_PROJECT_EMAIL = """ \
<html><head></head><body>
<p><b>From:</b> newsletter@ads.com</p>
<p><b>Subject:</b> Weekly newsletter</p>
<p>Check out our latest deals on electronics!</p>
</body></html>
"""
_EXISTING_TASK = {
" id " : " task-existing " ,
" title " : " Fix the login bug " ,
" status " : " todo " ,
" priority " : " medium " ,
# Canonical project definitions, referenced symbolically in cases.yaml.
_PROJECTS : dict [ str , dict ] = {
" alpha " : { " id " : " proj-alpha " , " name " : " Project Alpha " , " status " : " active " } ,
" beta " : { " id " : " proj-beta " , " name " : " Project Beta " , " status " : " active " } ,
}
# ── Fixture loading ───────────────────────────────────────────────────────
def _fixtures_dir(config) -> Path:
    """Return the root directory holding the eval fixtures.

    The ``--runner-dir`` option is read from *config*; a truthy value is used
    as the directory, otherwise the module default ``_DEFAULT_FIXTURE_DIR``
    is returned.
    """
    custom_dir = config.getoption("--runner-dir")
    if custom_dir:
        return Path(custom_dir)
    return _DEFAULT_FIXTURE_DIR
def _load_cases(config) -> list[dict]:
    """Load the eval case definitions from ``cases.yaml`` in the fixtures dir.

    Returns the parsed YAML document. When the document is empty (blank file
    or comments only) an empty list is returned instead of ``None`` so that
    callers can always iterate the result.
    """
    cases_file = _fixtures_dir(config) / "cases.yaml"
    loaded = yaml.safe_load(cases_file.read_text(encoding="utf-8"))
    # safe_load yields None for an empty document; keep the list contract.
    return [] if loaded is None else loaded
def _read_case_file(case: dict, data_dir: Path) -> str:
    """Return the UTF-8 text of the file named by ``case["file"]`` in *data_dir*."""
    source_path = data_dir / case["file"]
    return source_path.read_text(encoding="utf-8")
def _resolve_projects(entries: list[str | dict]) -> list[dict]:
    """Resolve a project list from YAML into project dicts.

    Each entry is either a symbolic name, looked up in ``_PROJECTS``, or an
    inline project dict, which is passed through unchanged. Input order is
    preserved.

    Raises:
        ValueError: a symbolic name is not a key of ``_PROJECTS``.
        TypeError: an entry is neither a string nor a dict.
    """
    resolved: list[dict] = []
    for entry in entries:
        if isinstance(entry, dict):
            resolved.append(entry)
        elif isinstance(entry, str):
            if entry not in _PROJECTS:
                # Fail loudly: dropping a misspelled name would run the case
                # against a different project set than the YAML describes.
                raise ValueError(
                    f"Unknown project name {entry!r}; known names: {sorted(_PROJECTS)}"
                )
            resolved.append(_PROJECTS[entry])
        else:
            raise TypeError(
                f"Project entry must be a name or a dict, got {type(entry).__name__}"
            )
    return resolved
# ── pytest_generate_tests — parametrize eval tests from YAML ─────────────
def pytest_generate_tests(metafunc):
    """Parametrize tests that request ``runner_case`` with the YAML cases.

    Tests that do not list the ``runner_case`` fixture are left untouched.
    Each case's ``id`` value is used as its pytest parameter id.
    """
    if "runner_case" in metafunc.fixturenames:
        yaml_cases = _load_cases(metafunc.config)
        case_ids = [entry["id"] for entry in yaml_cases]
        metafunc.parametrize("runner_case", yaml_cases, ids=case_ids)
# ── Test helpers ──────────────────────────────────────────────────────────
@@ -175,7 +179,7 @@ def _make_executor(
directory listing, file reading, project/entity fetching, and CRUD.
"""
calls : list [ dict ] = [ ]
_projects = projects or [ _PROJECT_ALPHA , _PROJECT_BETA ]
_projects = projects if projects is not None else list ( _PROJECTS . values ( ) )
async def _executor ( payload : dict ) - > dict :
action = payload . get ( " action " , " " )
@@ -184,10 +188,7 @@ def _make_executor(
calls . append ( { " action " : action , " table " : table , " data " : data } )
if action == " list_directory " :
path = data . get ( " path " , " " ) or payload . get ( " data " , { } ) . get ( " path " , " " )
return {
" entries " : [ { " type " : " file " , " path " : file_path } ]
}
return { " entries " : [ { " type " : " file " , " path " : file_path } ] }
if action == " get_file_metadata " :
return { " modifiedAt " : None }
@@ -225,7 +226,7 @@ def test_format_projects_empty():
def test_format_projects_with_data ( ) :
result = _format_projects ( [ _PROJECT_ALPHA ] )
result = _format_projects ( [ _PROJECTS [ " alpha " ] ] )
assert " proj-alpha " in result
assert " Project Alpha " in result
@@ -253,7 +254,6 @@ def test_get_extraction_rules_fallback():
def test_get_no_match_behavior_from_global_rules ( ) :
behavior = _get_no_match_behavior ( _AGENT_CONFIG )
# The global rule says "non creare alcuna entità" → skip behavior
assert behavior # non-empty
@@ -292,8 +292,8 @@ async def test_2_10_empty_file():
executor , calls = _make_executor (
file_path = " /emails/empty.html " ,
file_content = " " , # empty
projects = [ _PROJECT_ALPHA ] ,
file_content = " " ,
projects = [ _PROJECTS [ " alpha " ] ] ,
)
with patch ( " app.core.agent_runner._make_agent_executor " , return_value = executor ) , \
@@ -318,11 +318,10 @@ async def test_2_8_items_created_count():
executor , _calls = _make_executor (
file_path = " /emails/action.html " ,
file_content = _ACTION_EMAIL ,
projects = [ _PROJECT_ALPHA ] ,
file_content = " <html><body><p>Fix the login bug in Project Alpha.</p></body></html> " ,
projects = [ _PROJECTS [ " alpha " ] ] ,
)
# Simulate LLM calling create_task twice and update_note once.
async def mock_run_agent ( * , _tool_calls_out = None , * * kw ) - > str :
if _tool_calls_out is not None :
_tool_calls_out . extend ( [ " create_task " , " create_note " , " update_task " ] )
@@ -339,33 +338,43 @@ async def test_2_8_items_created_count():
assert kwargs [ " items_processed " ] == 1
# ── Eval: 2.1– 2.7 ( real LLM + Langfuse scoring) ──────────────── ──────────
# ── Eval: 2.1–2.7 — fixture-driven, real LLM + Langfuse scoring ──────────
#
# Langfuse V3 pattern:
# lf.start_as_current_observation(name=...) as context manager → obs object
# obs.score(name=..., value=...) (not lf.score(trace_id=...))
# contextlib.nullcontext() when lf is None → obs is None, no-op
# Cases loaded from tests/fixtures/agent_runner_v2/cases.yaml.
# Supported assertions (from YAML):
# expect_insert: <table> → at least 1 insert in that table
# expect_no_insert: true → zero inserts in any table
# expect_project_id: <id> → any insert carries this projectId
# expect_dedup: true → task inserts == 0 OR task updates >= 1
# ─────────────────────────────────────────────────────────────────────────
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_1_email_to_t ask ( ) :
""" 2.1 Action email → LLM calls create_task. Score: runner.email_to_t ask ."""
from contextlib import nullcontext
lf = get_langfuse ( )
async def test_eval_runner ( runner_c ase , pytestconfig ) :
"""Parametrized eval test — one invocation per YAML case."""
case : dict = runner_case
data_dir = _fixtures_dir ( pytestconfig ) / " data "
file_content = _read_case_file ( case , data_dir )
projects = _resolve_projects ( case . get ( " projects " , [ ] ) )
config = _make_config ( )
run_log = _make_run_log ( config . id )
mgr = _make_manager ( )
executor , calls = _make_executor (
file_path = " /emails/ProjectAlpha_action.html " ,
file_content = _ACTION_EMAIL ,
projects = [ _PROJECT_ALPHA , _PROJECT_BETA ] ,
file_path = case [ " file_path " ] ,
file_content = file_content ,
projects = projects ,
existing_tasks = case . get ( " existing_tasks " ) ,
existing_notes = case . get ( " existing_notes " ) ,
existing_timelines = case . get ( " existing_timelines " ) ,
)
lf = get_langfuse ( )
obs_ctx = lf . start_as_current_observation (
name = " eval-runner-2.1-email-to-task " , metadata = { " step " : " 2 " }
name = f " eval-runner-{ case [ ' id ' ] } - { case . get ( ' score_name ' , ' unknown ' ) . replace ( ' . ' , ' - ' ) } " ,
metadata = { " step " : " 2 " , " case_id " : case [ " id " ] } ,
) if lf else nullcontext ( )
with obs_ctx as obs :
@@ -374,253 +383,50 @@ async def test_2_1_email_to_task():
await run_local_agent ( _USER_ID , config , run_log , mgr )
_ , kwargs = mock_fin . call_args
task_create s = [ c for c in calls if c [ " action " ] == " insert " and c [ " table " ] == " tasks " ]
score = 1.0 if len ( task_creates ) > = 1 else 0.0
insert s = [ c for c in calls if c [ " action " ] == " insert " ]
score , comment = _evaluate_case ( case , calls , kwargs )
if obs is not None :
obs . score (
name = " runner.email_to_task " ,
name = case . get ( " score_name " , f " runner.case_ { case [ ' id ' ] } " ) ,
value = score ,
comment = f " task_creates= { len ( task_creates ) } items_created= { kwargs . get ( ' items_created ' ) } " ,
comment = comment ,
)
if lf :
lf . flush ( )
assert score == 1.0 , f " Expected at least 1 task created, got { len ( task_creates ) } "
assert score == 1.0 , f " [ { case [ ' id ' ] } ] { case . get ( ' description ' , ' ' ) } — { comment } "
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_2_email_to_note ( ) :
""" 2.2 Informational email → LLM calls create_note. Score: runner.email_to_note. """
from contextlib import nullcontext
lf = get_langfuse ( )
def _evaluate_case ( case : dict , calls : list [ dict ] , finalize_kwargs : dict ) - > tuple [ float , str ] :
""" Return (score, comment) for a YAML case given the captured executor calls. """
inserts = [ c for c in calls if c [ " action " ] == " insert " ]
config = _make_config ( )
run_log = _make_run_log ( config . id )
mgr = _make_manager ( )
executor , calls = _make_executor (
file_path = " /emails/ProjectAlpha_info.html " ,
file_content = _INFO_EMAIL ,
projects = [ _PROJECT_ALPHA , _PROJECT_BETA ] ,
)
obs_ctx = lf . start_as_current_observation (
name = " eval-runner-2.2-email-to-note " , metadata = { " step " : " 2 " }
) if lf else nullcontext ( )
with obs_ctx as obs :
with patch ( " app.core.agent_runner._make_agent_executor " , return_value = executor ) , \
patch ( " app.core.agent_runner._finalize_run " , new_callable = AsyncMock ) :
await run_local_agent ( _USER_ID , config , run_log , mgr )
note_creates = [ c for c in calls if c [ " action " ] == " insert " and c [ " table " ] == " notes " ]
score = 1.0 if len ( note_creates ) > = 1 else 0.0
if obs is not None :
obs . score ( name = " runner.email_to_note " , value = score ,
comment = f " note_creates= { len ( note_creates ) } " )
if lf :
lf . flush ( )
assert score == 1.0 , f " Expected at least 1 note created, got { len ( note_creates ) } "
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_3_email_to_timeline ( ) :
""" 2.3 Email with event date → LLM calls create_timeline. Score: runner.email_to_timeline. """
from contextlib import nullcontext
lf = get_langfuse ( )
config = _make_config ( )
run_log = _make_run_log ( config . id )
mgr = _make_manager ( )
executor , calls = _make_executor (
file_path = " /emails/ProjectAlpha_kickoff.html " ,
file_content = _DATE_EMAIL ,
projects = [ _PROJECT_ALPHA , _PROJECT_BETA ] ,
)
obs_ctx = lf . start_as_current_observation (
name = " eval-runner-2.3-email-to-timeline " , metadata = { " step " : " 2 " }
) if lf else nullcontext ( )
with obs_ctx as obs :
with patch ( " app.core.agent_runner._make_agent_executor " , return_value = executor ) , \
patch ( " app.core.agent_runner._finalize_run " , new_callable = AsyncMock ) :
await run_local_agent ( _USER_ID , config , run_log , mgr )
tl_creates = [ c for c in calls if c [ " action " ] == " insert " and c [ " table " ] == " timelines " ]
score = 1.0 if len ( tl_creates ) > = 1 else 0.0
if obs is not None :
obs . score ( name = " runner.email_to_timeline " , value = score ,
comment = f " timeline_creates= { len ( tl_creates ) } " )
if lf :
lf . flush ( )
assert score == 1.0 , f " Expected at least 1 timeline created, got { len ( tl_creates ) } "
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_4_project_matching_filename ( ) :
""" 2.4 Filename contains ' ProjectAlpha ' → LLM assigns to proj-alpha. Score: runner.project_filename. """
from contextlib import nullcontext
lf = get_langfuse ( )
config = _make_config ( )
run_log = _make_run_log ( config . id )
mgr = _make_manager ( )
executor , calls = _make_executor (
file_path = " /emails/ProjectAlpha_report.html " ,
file_content = _ACTION_EMAIL ,
projects = [ _PROJECT_ALPHA , _PROJECT_BETA ] ,
)
obs_ctx = lf . start_as_current_observation (
name = " eval-runner-2.4-project-filename " , metadata = { " step " : " 2 " }
) if lf else nullcontext ( )
with obs_ctx as obs :
with patch ( " app.core.agent_runner._make_agent_executor " , return_value = executor ) , \
patch ( " app.core.agent_runner._finalize_run " , new_callable = AsyncMock ) :
await run_local_agent ( _USER_ID , config , run_log , mgr )
inserts = [ c for c in calls if c [ " action " ] == " insert " ]
correct_project = any (
c . get ( " data " , { } ) . get ( " projectId " ) == " proj-alpha " for c in inserts
)
score = 1.0 if correct_project else 0.0
if obs is not None :
obs . score ( name = " runner.project_filename " , value = score )
if lf :
lf . flush ( )
assert score == 1.0 , " Expected inserts to use proj-alpha based on filename "
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_5_project_matching_content ( ) :
""" 2.5 Email body mentions ' Project Alpha ' → correct project assigned. Score: runner.project_content. """
from contextlib import nullcontext
lf = get_langfuse ( )
config = _make_config ( )
run_log = _make_run_log ( config . id )
mgr = _make_manager ( )
executor , calls = _make_executor (
file_path = " /emails/email_001.html " , # generic filename, no project hint
file_content = _ACTION_EMAIL , # body mentions "Project Alpha"
projects = [ _PROJECT_ALPHA , _PROJECT_BETA ] ,
)
obs_ctx = lf . start_as_current_observation (
name = " eval-runner-2.5-project-content " , metadata = { " step " : " 2 " }
) if lf else nullcontext ( )
with obs_ctx as obs :
with patch ( " app.core.agent_runner._make_agent_executor " , return_value = executor ) , \
patch ( " app.core.agent_runner._finalize_run " , new_callable = AsyncMock ) :
await run_local_agent ( _USER_ID , config , run_log , mgr )
inserts = [ c for c in calls if c [ " action " ] == " insert " ]
correct_project = any (
c . get ( " data " , { } ) . get ( " projectId " ) == " proj-alpha " for c in inserts
)
score = 1.0 if correct_project else 0.0
if obs is not None :
obs . score ( name = " runner.project_content " , value = score )
if lf :
lf . flush ( )
assert score == 1.0 , " Expected inserts to use proj-alpha based on email body content "
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_6_no_project_match_global_rule ( ) :
""" 2.6 Newsletter email + global rule ' no project = no entities ' → no creates. Score: runner.no_project. """
from contextlib import nullcontext
lf = get_langfuse ( )
config = _make_config ( )
run_log = _make_run_log ( config . id )
mgr = _make_manager ( )
executor , calls = _make_executor (
file_path = " /emails/newsletter.html " ,
file_content = _NO_PROJECT_EMAIL ,
projects = [ _PROJECT_ALPHA , _PROJECT_BETA ] ,
)
obs_ctx = lf . start_as_current_observation (
name = " eval-runner-2.6-no-project " , metadata = { " step " : " 2 " }
) if lf else nullcontext ( )
with obs_ctx as obs :
with patch ( " app.core.agent_runner._make_agent_executor " , return_value = executor ) , \
patch ( " app.core.agent_runner._finalize_run " , new_callable = AsyncMock ) :
await run_local_agent ( _USER_ID , config , run_log , mgr )
inserts = [ c for c in calls if c [ " action " ] == " insert " ]
if case . get ( " expect_no_insert " ) :
score = 1.0 if len ( inserts ) == 0 else 0.0
return score , f " inserts= { len ( inserts ) } (expected 0) "
if obs is not Non e:
obs . score ( name = " runner.no_project " , value = score ,
comment = f " inserts= { len ( inserts ) } " )
if " expect_insert " in cas e:
tables = case [ " expect_insert " ]
if isinstance ( tab les , str ) :
tables = [ tables ]
missing = [ t for t in tables if not any ( c [ " table " ] == t for c in inserts ) ]
score = 1.0 if not missing else 0.0
counts = { t : sum ( 1 for c in inserts if c [ " table " ] == t ) for t in tables }
return score , f " inserts= { counts } " + ( f " missing= { missing } " if missing else " " )
if lf :
lf . flush ( )
if " expect_project_id " in case :
expected_pid = case [ " expect_project_id " ]
correct = any ( c . get ( " data " , { } ) . get ( " projectId " ) == expected_pid for c in inserts )
score = 1.0 if correct else 0.0
all_pids = [ c . get ( " data " , { } ) . get ( " projectId " ) for c in inserts ]
return score , f " projectIds= { all_pids } (expected { expected_pid !r} ) "
assert score == 1.0 , f " E xpected 0 inserts for unmatched newsletter, got { len ( inserts ) } "
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_7_deduplication ( ) :
""" 2.7 Existing task with same title → LLM calls update_task, not create_task. Score: runner.dedup. """
from contextlib import nullcontext
lf = get_langfuse ( )
config = _make_config ( )
run_log = _make_run_log ( config . id )
mgr = _make_manager ( )
executor , calls = _make_executor (
file_path = " /emails/ProjectAlpha_followup.html " ,
file_content = _ACTION_EMAIL , # "Fix the login bug" — already exists
projects = [ _PROJECT_ALPHA ] ,
existing_tasks = [ _EXISTING_TASK ] , # task already exists
)
obs_ctx = lf . start_as_current_observation (
name = " eval-runner-2.7-dedup " , metadata = { " step " : " 2 " }
) if lf else nullcontext ( )
with obs_ctx as obs :
with patch ( " app.core.agent_runner._make_agent_executor " , return_value = executor ) , \
patch ( " app.core.agent_runner._finalize_run " , new_callable = AsyncMock ) :
await run_local_agent ( _USER_ID , config , run_log , mgr )
task_creates = [ c for c in calls if c [ " action " ] == " insert " and c [ " table " ] == " tasks " ]
task_updates = [ c for c in calls if c [ " action " ] == " update " and c . get ( " table " ) == " tasks " ]
if case . get ( " e xpect_dedup " ) :
task_creates = [ c for c in inserts if c [ " table " ] == " tasks " ]
task_updates = [ c for c in calls if c [ " action " ] == " update " and c [ " table " ] == " tasks " ]
score = 1.0 if len ( task_creates ) == 0 or len ( task_updates ) > = 1 else 0.0
return score , f " task_creates= { len ( task_creates ) } task_updates= { len ( task_updates ) } "
if obs is not None :
obs . score ( name = " runner.dedup " , value = score ,
comment = f " creates= { len ( task_creates ) } updates= { len ( task_updates ) } " )
if lf :
lf . flush ( )
assert score == 1.0 , (
f " Expected deduplication: creates= { len ( task_creates ) } , updates= { len ( task_updates ) } "
)
return 0.0 , " no assertion defined in case "