fix(langfuse): remove invalid user_id/session_id kwargs from start_as_current_observation

Langfuse V3 does not accept user_id/session_id on observation-level calls.
Moved to metadata dict in agent_runner, deep_agent, and agent_setup.

refactor(tests): fixture-based pattern for agent_runner_v2 eval tests

- cases.yaml + data/ fixtures under tests/fixtures/agent_runner_v2/
- pytest_generate_tests parametrizes test_eval_runner from YAML
- _resolve_projects() handles symbolic names and inline dicts
- _evaluate_case() centralizes all assertion logic
- --runner-dir CLI option for custom fixture folders

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Roberto Musso
2026-04-08 00:45:15 +02:00
parent d8add7e8cb
commit e672b58b6f
9 changed files with 235 additions and 321 deletions

View File

@@ -175,7 +175,7 @@ def _build_system_prompt(
else "" else ""
) )
template, prompt_obj = get_prompt_or_fallback( template, prompt_obj = get_prompt_or_fallback(
"journey_system_v2", _JOURNEY_SYSTEM_PROMPT "journey_system", _JOURNEY_SYSTEM_PROMPT
) )
compiled = compile_prompt( compiled = compile_prompt(
template, template,

View File

@@ -251,7 +251,7 @@ async def _run_agent_with_tools(
lf.start_as_current_observation( lf.start_as_current_observation(
as_type="span", as_type="span",
name=agent_name, name=agent_name,
user_id=user_id or None, metadata={"user_id": user_id} if user_id else None,
input=user_message, input=user_message,
) )
if lf else None if lf else None

View File

@@ -615,8 +615,7 @@ async def _run_single_agent(
lf.start_as_current_observation( lf.start_as_current_observation(
as_type="span", as_type="span",
name=agent_name, name=agent_name,
user_id=user_id, metadata={"user_id": user_id, "session_id": trace_id},
session_id=trace_id,
input=message, input=message,
) )
if lf else None if lf else None
@@ -740,8 +739,7 @@ async def _run_single_agent_stream(
lf.start_as_current_observation( lf.start_as_current_observation(
as_type="span", as_type="span",
name=f"{agent_name}-stream", name=f"{agent_name}-stream",
user_id=user_id, metadata={"user_id": user_id, "session_id": trace_id},
session_id=trace_id,
input=message, input=message,
) )
if lf else None if lf else None

View File

@@ -0,0 +1,86 @@
# Agent Runner V2 — eval test cases (Step 2, requires real LLM)
#
# Each case drives one parametrized `test_eval_runner` invocation.
#
# Keys
# ----
# id: str unique identifier shown in pytest output
# description: str human-readable label
# file: str filename inside data/
# file_path: str path reported to the executor (affects project-matching via filename)
# projects: [alpha|beta] symbolic project names (or inline project dicts) resolved by the test helper
#
# Optional pre-existing records (dedup tests)
# existing_tasks: list of {id, title, status, priority}
# existing_notes: list of {id, title, content}
# existing_timelines: list of {id, title, date}
#
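# Assertions (the helper scores only the first one present, checked in this order:
# expect_no_insert, expect_insert, expect_project_id, expect_dedup)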
# expect_insert: <table> at least 1 insert row in this table (tasks|notes|timelines)
# expect_no_insert: true zero inserts in any table
# expect_project_id: <id> any insert must carry this projectId
# expect_dedup: true task inserts == 0 OR task updates >= 1 (dedup check)
#
# Langfuse
# score_name: str observation score name
- id: "2.1"
description: "Action email → create_task"
file: email_action.html
file_path: /emails/ProjectAlpha_action.html
projects: [alpha, beta]
expect_insert: tasks
score_name: runner.email_to_task
- id: "2.2"
description: "Informational email → create_note"
file: email_info.html
file_path: /emails/ProjectAlpha_info.html
projects: [alpha, beta]
expect_insert: notes
score_name: runner.email_to_note
- id: "2.3"
description: "Email with meeting date → create_timeline"
file: email_date.html
file_path: /emails/ProjectAlpha_kickoff.html
projects: [alpha, beta]
expect_insert: timelines
score_name: runner.email_to_timeline
- id: "2.4"
description: "Filename contains project name → correct project assigned"
file: email_action.html
file_path: /emails/ProjectAlpha_report.html
projects: [alpha, beta]
expect_project_id: proj-alpha
score_name: runner.project_filename
- id: "2.5"
description: "Email body mentions project → correct project assigned"
file: email_action.html
file_path: /emails/email_001.html
projects: [alpha, beta]
expect_project_id: proj-alpha
score_name: runner.project_content
- id: "2.6"
description: "Newsletter + global rule no-project → no creates"
file: email_no_project.html
file_path: /emails/newsletter.html
projects: [alpha, beta]
expect_no_insert: true
score_name: runner.no_project
- id: "2.7"
description: "Existing task with same title → dedup (update not create)"
file: email_action.html
file_path: /emails/ProjectAlpha_followup.html
projects: [alpha]
existing_tasks:
- id: task-existing
title: Fix the login bug
status: todo
priority: medium
expect_dedup: true
score_name: runner.dedup

View File

@@ -0,0 +1,7 @@
<html><head></head><body>
<p><b>From:</b> boss@company.com</p>
<p><b>To:</b> dev@company.com</p>
<p><b>Subject:</b> Fix the login bug</p>
<p><b>Date:</b> 2026-04-07</p>
<p>Hi,<br>Please fix the login bug in Project Alpha by Friday. High priority!</p>
</body></html>

View File

@@ -0,0 +1,5 @@
<html><head></head><body>
<p><b>From:</b> pm@company.com</p>
<p><b>Subject:</b> Project Alpha kick-off meeting</p>
<p>The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.</p>
</body></html>

View File

@@ -0,0 +1,7 @@
<html><head></head><body>
<p><b>From:</b> pm@company.com</p>
<p><b>To:</b> team@company.com</p>
<p><b>Subject:</b> FYI: New policy for Project Alpha</p>
<p>Just a heads-up that starting next week all code reviews must be done
within 24 hours for Project Alpha. No action needed from you now.</p>
</body></html>

View File

@@ -0,0 +1,5 @@
<html><head></head><body>
<p><b>From:</b> newsletter@ads.com</p>
<p><b>Subject:</b> Weekly newsletter</p>
<p>Check out our latest deals on electronics!</p>
</body></html>

View File

@@ -4,32 +4,36 @@ Covers the unified per-file flow:
Phase A — detect + preprocess (Python, zero LLM) Phase A — detect + preprocess (Python, zero LLM)
Phase B — single LLM call with tools (classify + extract + create) Phase B — single LLM call with tools (classify + extract + create)
Test cases: Fixture-based eval tests (2.1–2.7)
2.1 Happy path: email with action → create_task called -----------------------------------
2.2 Happy path: email informative → create_note called Cases are defined in tests/fixtures/agent_runner_v2/cases.yaml.
2.3 Happy path: email with date → create_timeline called Email HTML files live in tests/fixtures/agent_runner_v2/data/.
2.4 Project matching via filename → correct project_id used Use --runner-dir to point at a custom folder (same structure required).
2.5 Project matching via content → correct project_id used
2.6 No project match + global rule → no create_* called Unit tests (no LLM)
2.7 Deduplication → update_task, not create_task --------------------
2.8 items_created count (unit) → items_created == N create_* calls 2.8 items_created count → items_created == N create_* calls
2.9 Device offline (unit) → status=error 2.9 Device offline → status=error
2.10 Empty file (unit) → items_processed=0, status=success 2.10 Empty file → items_processed=0, status=success
Run: Run:
pytest tests/test_agent_runner_v2.py -v pytest tests/test_agent_runner_v2.py -v
pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8" # unit only pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8" # unit only
pytest tests/test_agent_runner_v2.py -v -k "eval" # LLM evals only pytest tests/test_agent_runner_v2.py -v -k "eval" # LLM evals only
pytest tests/test_agent_runner_v2.py -v --runner-dir /path/to/dir # custom fixtures
""" """
from __future__ import annotations from __future__ import annotations
import uuid import uuid
from contextlib import nullcontext
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path
from typing import Any from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch from unittest.mock import AsyncMock, MagicMock, patch
import pytest import pytest
import yaml
from app.core.agent_runner import ( from app.core.agent_runner import (
_format_metadata, _format_metadata,
@@ -40,7 +44,7 @@ from app.core.agent_runner import (
run_local_agent, run_local_agent,
) )
from app.core.device_manager import DeviceConnectionManager from app.core.device_manager import DeviceConnectionManager
from app.core.langfuse_client import get_langfuse, get_prompt_or_fallback from app.core.langfuse_client import get_langfuse
from app.models import AgentRunLog, LocalAgentConfig from app.models import AgentRunLog, LocalAgentConfig
from tests.conftest import TEST_USER_IDS from tests.conftest import TEST_USER_IDS
@@ -48,6 +52,8 @@ from tests.conftest import TEST_USER_IDS
_USER_ID = TEST_USER_IDS["power"] _USER_ID = TEST_USER_IDS["power"]
_DEFAULT_FIXTURE_DIR = Path(__file__).parent / "fixtures" / "agent_runner_v2"
_AGENT_CONFIG = { _AGENT_CONFIG = {
"content_types": [ "content_types": [
{ {
@@ -68,55 +74,53 @@ _AGENT_CONFIG = {
"data_types": ["tasks", "notes", "timelines"], "data_types": ["tasks", "notes", "timelines"],
} }
_PROJECT_ALPHA = {"id": "proj-alpha", "name": "Project Alpha", "status": "active"} # Canonical project definitions, referenced symbolically in cases.yaml.
_PROJECT_BETA = {"id": "proj-beta", "name": "Project Beta", "status": "active"} _PROJECTS: dict[str, dict] = {
"alpha": {"id": "proj-alpha", "name": "Project Alpha", "status": "active"},
# ── Sample email content ────────────────────────────────────────────────── "beta": {"id": "proj-beta", "name": "Project Beta", "status": "active"},
_ACTION_EMAIL = """\
<html><head></head><body>
<p><b>From:</b> boss@company.com</p>
<p><b>To:</b> dev@company.com</p>
<p><b>Subject:</b> Fix the login bug</p>
<p><b>Date:</b> 2026-04-07</p>
<p>Hi,<br>Please fix the login bug in Project Alpha by Friday. High priority!</p>
</body></html>
"""
_INFO_EMAIL = """\
<html><head></head><body>
<p><b>From:</b> pm@company.com</p>
<p><b>To:</b> team@company.com</p>
<p><b>Subject:</b> FYI: New policy for Project Alpha</p>
<p>Just a heads-up that starting next week all code reviews must be done
within 24 hours for Project Alpha. No action needed from you now.</p>
</body></html>
"""
_DATE_EMAIL = """\
<html><head></head><body>
<p><b>From:</b> pm@company.com</p>
<p><b>Subject:</b> Project Alpha kick-off meeting</p>
<p>The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.</p>
</body></html>
"""
_NO_PROJECT_EMAIL = """\
<html><head></head><body>
<p><b>From:</b> newsletter@ads.com</p>
<p><b>Subject:</b> Weekly newsletter</p>
<p>Check out our latest deals on electronics!</p>
</body></html>
"""
_EXISTING_TASK = {
"id": "task-existing",
"title": "Fix the login bug",
"status": "todo",
"priority": "medium",
} }
# ── Fixture loading ───────────────────────────────────────────────────────
def _fixtures_dir(config) -> Path:
    """Return the directory holding ``cases.yaml`` and the ``data/`` folder.

    The ``--runner-dir`` command-line option wins when it is set to a
    non-empty value; otherwise ``_DEFAULT_FIXTURE_DIR`` is used.

    Args:
        config: The pytest config object (anything exposing ``getoption``).

    Returns:
        The fixture directory as a ``Path``.
    """
    # default=None keeps collection working when --runner-dir has not been
    # registered: without a default, pytest raises ValueError for an
    # unknown option name.
    override = config.getoption("--runner-dir", default=None)
    return Path(override) if override else _DEFAULT_FIXTURE_DIR
def _load_cases(config) -> list[dict]:
    """Load the eval case definitions from ``cases.yaml``.

    Args:
        config: The pytest config object, forwarded to ``_fixtures_dir`` to
            locate the fixture directory.

    Returns:
        The list of case mappings; an empty list when the file has no
        YAML content.

    Raises:
        TypeError: If the top level of the YAML document is not a list.
    """
    cases_path = _fixtures_dir(config) / "cases.yaml"
    loaded = yaml.safe_load(cases_path.read_text(encoding="utf-8"))
    if loaded is None:
        # safe_load yields None for an empty or comment-only document;
        # treat that as "no cases" instead of failing later on iteration.
        return []
    if not isinstance(loaded, list):
        raise TypeError(
            f"{cases_path} must contain a top-level list of cases, "
            f"got {type(loaded).__name__}"
        )
    return loaded
def _read_case_file(case: dict, data_dir: Path) -> str:
    """Return the UTF-8 text of the fixture file named by ``case["file"]``.

    The filename is resolved relative to ``data_dir``.
    """
    fixture_path = data_dir / case["file"]
    return fixture_path.read_text(encoding="utf-8")
def _resolve_projects(entries: list[str | dict]) -> list[dict]:
    """Resolve the ``projects`` list of a YAML case into project dicts.

    Each entry is either a symbolic name looked up in ``_PROJECTS`` or an
    inline dict that is passed through unchanged.

    Args:
        entries: Project entries from the case; ``None`` (an empty
            ``projects:`` key in YAML) is treated as an empty list.

    Returns:
        The resolved project dicts, in the same order as ``entries``.

    Raises:
        TypeError: If an entry is neither a string nor a dict.
        ValueError: If a symbolic name is not defined in ``_PROJECTS``.
    """
    resolved: list[dict] = []
    for entry in entries or []:
        if isinstance(entry, dict):
            resolved.append(entry)
            continue
        if not isinstance(entry, str):
            raise TypeError(
                f"project entry must be a symbolic name or a dict, "
                f"got {type(entry).__name__}: {entry!r}"
            )
        # Fail loudly on a misspelled name: silently dropping it would run
        # the case against a smaller project list than the fixture intends.
        if entry not in _PROJECTS:
            raise ValueError(
                f"unknown project name {entry!r}; known names: {sorted(_PROJECTS)}"
            )
        resolved.append(_PROJECTS[entry])
    return resolved
# ── pytest_generate_tests — parametrize eval tests from YAML ─────────────
def pytest_generate_tests(metafunc):
    """Parametrize tests that request ``runner_case`` with the YAML cases.

    Each loaded case becomes one invocation, identified in pytest output by
    its ``id`` field. Tests that do not use ``runner_case`` are left alone.
    """
    if "runner_case" in metafunc.fixturenames:
        loaded_cases = _load_cases(metafunc.config)
        case_ids = [entry["id"] for entry in loaded_cases]
        metafunc.parametrize("runner_case", loaded_cases, ids=case_ids)
# ── Test helpers ────────────────────────────────────────────────────────── # ── Test helpers ──────────────────────────────────────────────────────────
@@ -175,7 +179,7 @@ def _make_executor(
directory listing, file reading, project/entity fetching, and CRUD. directory listing, file reading, project/entity fetching, and CRUD.
""" """
calls: list[dict] = [] calls: list[dict] = []
_projects = projects or [_PROJECT_ALPHA, _PROJECT_BETA] _projects = projects if projects is not None else list(_PROJECTS.values())
async def _executor(payload: dict) -> dict: async def _executor(payload: dict) -> dict:
action = payload.get("action", "") action = payload.get("action", "")
@@ -184,10 +188,7 @@ def _make_executor(
calls.append({"action": action, "table": table, "data": data}) calls.append({"action": action, "table": table, "data": data})
if action == "list_directory": if action == "list_directory":
path = data.get("path", "") or payload.get("data", {}).get("path", "") return {"entries": [{"type": "file", "path": file_path}]}
return {
"entries": [{"type": "file", "path": file_path}]
}
if action == "get_file_metadata": if action == "get_file_metadata":
return {"modifiedAt": None} return {"modifiedAt": None}
@@ -225,7 +226,7 @@ def test_format_projects_empty():
def test_format_projects_with_data(): def test_format_projects_with_data():
result = _format_projects([_PROJECT_ALPHA]) result = _format_projects([_PROJECTS["alpha"]])
assert "proj-alpha" in result assert "proj-alpha" in result
assert "Project Alpha" in result assert "Project Alpha" in result
@@ -253,7 +254,6 @@ def test_get_extraction_rules_fallback():
def test_get_no_match_behavior_from_global_rules(): def test_get_no_match_behavior_from_global_rules():
behavior = _get_no_match_behavior(_AGENT_CONFIG) behavior = _get_no_match_behavior(_AGENT_CONFIG)
# The global rule says "non creare alcuna entità" → skip behavior
assert behavior # non-empty assert behavior # non-empty
@@ -292,8 +292,8 @@ async def test_2_10_empty_file():
executor, calls = _make_executor( executor, calls = _make_executor(
file_path="/emails/empty.html", file_path="/emails/empty.html",
file_content="", # empty file_content="",
projects=[_PROJECT_ALPHA], projects=[_PROJECTS["alpha"]],
) )
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
@@ -318,11 +318,10 @@ async def test_2_8_items_created_count():
executor, _calls = _make_executor( executor, _calls = _make_executor(
file_path="/emails/action.html", file_path="/emails/action.html",
file_content=_ACTION_EMAIL, file_content="<html><body><p>Fix the login bug in Project Alpha.</p></body></html>",
projects=[_PROJECT_ALPHA], projects=[_PROJECTS["alpha"]],
) )
# Simulate LLM calling create_task twice and update_note once.
async def mock_run_agent(*, _tool_calls_out=None, **kw) -> str: async def mock_run_agent(*, _tool_calls_out=None, **kw) -> str:
if _tool_calls_out is not None: if _tool_calls_out is not None:
_tool_calls_out.extend(["create_task", "create_note", "update_task"]) _tool_calls_out.extend(["create_task", "create_note", "update_task"])
@@ -339,33 +338,43 @@ async def test_2_8_items_created_count():
assert kwargs["items_processed"] == 1 assert kwargs["items_processed"] == 1
# ── Eval: 2.1–2.7 (real LLM + Langfuse scoring) ────────────────────────── # ── Eval: 2.1–2.7 — fixture-driven, real LLM + Langfuse scoring ──────────
# #
# Langfuse V3 pattern: # Cases loaded from tests/fixtures/agent_runner_v2/cases.yaml.
# lf.start_as_current_observation(name=...) as context manager → obs object # Supported assertions (from YAML):
# obs.score(name=..., value=...) (not lf.score(trace_id=...)) # expect_insert: <table> → at least 1 insert in that table
# contextlib.nullcontext() when lf is None → obs is None, no-op # expect_no_insert: true → zero inserts in any table
# expect_project_id: <id> → any insert carries this projectId
# expect_dedup: true → task inserts == 0 OR task updates >= 1
# ───────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.eval @pytest.mark.eval
async def test_2_1_email_to_task(): async def test_eval_runner(runner_case, pytestconfig):
"""2.1 Action email → LLM calls create_task. Score: runner.email_to_task.""" """Parametrized eval test — one invocation per YAML case."""
from contextlib import nullcontext case: dict = runner_case
lf = get_langfuse() data_dir = _fixtures_dir(pytestconfig) / "data"
file_content = _read_case_file(case, data_dir)
projects = _resolve_projects(case.get("projects", []))
config = _make_config() config = _make_config()
run_log = _make_run_log(config.id) run_log = _make_run_log(config.id)
mgr = _make_manager() mgr = _make_manager()
executor, calls = _make_executor( executor, calls = _make_executor(
file_path="/emails/ProjectAlpha_action.html", file_path=case["file_path"],
file_content=_ACTION_EMAIL, file_content=file_content,
projects=[_PROJECT_ALPHA, _PROJECT_BETA], projects=projects,
existing_tasks=case.get("existing_tasks"),
existing_notes=case.get("existing_notes"),
existing_timelines=case.get("existing_timelines"),
) )
lf = get_langfuse()
obs_ctx = lf.start_as_current_observation( obs_ctx = lf.start_as_current_observation(
name="eval-runner-2.1-email-to-task", metadata={"step": "2"} name=f"eval-runner-{case['id']}-{case.get('score_name', 'unknown').replace('.', '-')}",
metadata={"step": "2", "case_id": case["id"]},
) if lf else nullcontext() ) if lf else nullcontext()
with obs_ctx as obs: with obs_ctx as obs:
@@ -374,253 +383,50 @@ async def test_2_1_email_to_task():
await run_local_agent(_USER_ID, config, run_log, mgr) await run_local_agent(_USER_ID, config, run_log, mgr)
_, kwargs = mock_fin.call_args _, kwargs = mock_fin.call_args
task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"] inserts = [c for c in calls if c["action"] == "insert"]
score = 1.0 if len(task_creates) >= 1 else 0.0 score, comment = _evaluate_case(case, calls, kwargs)
if obs is not None: if obs is not None:
obs.score( obs.score(
name="runner.email_to_task", name=case.get("score_name", f"runner.case_{case['id']}"),
value=score, value=score,
comment=f"task_creates={len(task_creates)} items_created={kwargs.get('items_created')}", comment=comment,
) )
if lf: if lf:
lf.flush() lf.flush()
assert score == 1.0, f"Expected at least 1 task created, got {len(task_creates)}" assert score == 1.0, f"[{case['id']}] {case.get('description', '')}{comment}"
@pytest.mark.asyncio def _evaluate_case(case: dict, calls: list[dict], finalize_kwargs: dict) -> tuple[float, str]:
@pytest.mark.eval """Return (score, comment) for a YAML case given the captured executor calls."""
async def test_2_2_email_to_note(): inserts = [c for c in calls if c["action"] == "insert"]
"""2.2 Informational email → LLM calls create_note. Score: runner.email_to_note."""
from contextlib import nullcontext
lf = get_langfuse()
config = _make_config() if case.get("expect_no_insert"):
run_log = _make_run_log(config.id)
mgr = _make_manager()
executor, calls = _make_executor(
file_path="/emails/ProjectAlpha_info.html",
file_content=_INFO_EMAIL,
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
)
obs_ctx = lf.start_as_current_observation(
name="eval-runner-2.2-email-to-note", metadata={"step": "2"}
) if lf else nullcontext()
with obs_ctx as obs:
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
await run_local_agent(_USER_ID, config, run_log, mgr)
note_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "notes"]
score = 1.0 if len(note_creates) >= 1 else 0.0
if obs is not None:
obs.score(name="runner.email_to_note", value=score,
comment=f"note_creates={len(note_creates)}")
if lf:
lf.flush()
assert score == 1.0, f"Expected at least 1 note created, got {len(note_creates)}"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_3_email_to_timeline():
"""2.3 Email with event date → LLM calls create_timeline. Score: runner.email_to_timeline."""
from contextlib import nullcontext
lf = get_langfuse()
config = _make_config()
run_log = _make_run_log(config.id)
mgr = _make_manager()
executor, calls = _make_executor(
file_path="/emails/ProjectAlpha_kickoff.html",
file_content=_DATE_EMAIL,
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
)
obs_ctx = lf.start_as_current_observation(
name="eval-runner-2.3-email-to-timeline", metadata={"step": "2"}
) if lf else nullcontext()
with obs_ctx as obs:
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
await run_local_agent(_USER_ID, config, run_log, mgr)
tl_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "timelines"]
score = 1.0 if len(tl_creates) >= 1 else 0.0
if obs is not None:
obs.score(name="runner.email_to_timeline", value=score,
comment=f"timeline_creates={len(tl_creates)}")
if lf:
lf.flush()
assert score == 1.0, f"Expected at least 1 timeline created, got {len(tl_creates)}"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_4_project_matching_filename():
"""2.4 Filename contains 'ProjectAlpha' → LLM assigns to proj-alpha. Score: runner.project_filename."""
from contextlib import nullcontext
lf = get_langfuse()
config = _make_config()
run_log = _make_run_log(config.id)
mgr = _make_manager()
executor, calls = _make_executor(
file_path="/emails/ProjectAlpha_report.html",
file_content=_ACTION_EMAIL,
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
)
obs_ctx = lf.start_as_current_observation(
name="eval-runner-2.4-project-filename", metadata={"step": "2"}
) if lf else nullcontext()
with obs_ctx as obs:
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
await run_local_agent(_USER_ID, config, run_log, mgr)
inserts = [c for c in calls if c["action"] == "insert"]
correct_project = any(
c.get("data", {}).get("projectId") == "proj-alpha" for c in inserts
)
score = 1.0 if correct_project else 0.0
if obs is not None:
obs.score(name="runner.project_filename", value=score)
if lf:
lf.flush()
assert score == 1.0, "Expected inserts to use proj-alpha based on filename"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_5_project_matching_content():
"""2.5 Email body mentions 'Project Alpha' → correct project assigned. Score: runner.project_content."""
from contextlib import nullcontext
lf = get_langfuse()
config = _make_config()
run_log = _make_run_log(config.id)
mgr = _make_manager()
executor, calls = _make_executor(
file_path="/emails/email_001.html", # generic filename, no project hint
file_content=_ACTION_EMAIL, # body mentions "Project Alpha"
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
)
obs_ctx = lf.start_as_current_observation(
name="eval-runner-2.5-project-content", metadata={"step": "2"}
) if lf else nullcontext()
with obs_ctx as obs:
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
await run_local_agent(_USER_ID, config, run_log, mgr)
inserts = [c for c in calls if c["action"] == "insert"]
correct_project = any(
c.get("data", {}).get("projectId") == "proj-alpha" for c in inserts
)
score = 1.0 if correct_project else 0.0
if obs is not None:
obs.score(name="runner.project_content", value=score)
if lf:
lf.flush()
assert score == 1.0, "Expected inserts to use proj-alpha based on email body content"
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_6_no_project_match_global_rule():
"""2.6 Newsletter email + global rule 'no project = no entities' → no creates. Score: runner.no_project."""
from contextlib import nullcontext
lf = get_langfuse()
config = _make_config()
run_log = _make_run_log(config.id)
mgr = _make_manager()
executor, calls = _make_executor(
file_path="/emails/newsletter.html",
file_content=_NO_PROJECT_EMAIL,
projects=[_PROJECT_ALPHA, _PROJECT_BETA],
)
obs_ctx = lf.start_as_current_observation(
name="eval-runner-2.6-no-project", metadata={"step": "2"}
) if lf else nullcontext()
with obs_ctx as obs:
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
await run_local_agent(_USER_ID, config, run_log, mgr)
inserts = [c for c in calls if c["action"] == "insert"]
score = 1.0 if len(inserts) == 0 else 0.0 score = 1.0 if len(inserts) == 0 else 0.0
return score, f"inserts={len(inserts)} (expected 0)"
if obs is not None: if "expect_insert" in case:
obs.score(name="runner.no_project", value=score, tables = case["expect_insert"]
comment=f"inserts={len(inserts)}") if isinstance(tables, str):
tables = [tables]
missing = [t for t in tables if not any(c["table"] == t for c in inserts)]
score = 1.0 if not missing else 0.0
counts = {t: sum(1 for c in inserts if c["table"] == t) for t in tables}
return score, f"inserts={counts}" + (f" missing={missing}" if missing else "")
if lf: if "expect_project_id" in case:
lf.flush() expected_pid = case["expect_project_id"]
correct = any(c.get("data", {}).get("projectId") == expected_pid for c in inserts)
score = 1.0 if correct else 0.0
all_pids = [c.get("data", {}).get("projectId") for c in inserts]
return score, f"projectIds={all_pids} (expected {expected_pid!r})"
assert score == 1.0, f"Expected 0 inserts for unmatched newsletter, got {len(inserts)}" if case.get("expect_dedup"):
task_creates = [c for c in inserts if c["table"] == "tasks"]
task_updates = [c for c in calls if c["action"] == "update" and c["table"] == "tasks"]
@pytest.mark.asyncio
@pytest.mark.eval
async def test_2_7_deduplication():
"""2.7 Existing task with same title → LLM calls update_task, not create_task. Score: runner.dedup."""
from contextlib import nullcontext
lf = get_langfuse()
config = _make_config()
run_log = _make_run_log(config.id)
mgr = _make_manager()
executor, calls = _make_executor(
file_path="/emails/ProjectAlpha_followup.html",
file_content=_ACTION_EMAIL, # "Fix the login bug" — already exists
projects=[_PROJECT_ALPHA],
existing_tasks=[_EXISTING_TASK], # task already exists
)
obs_ctx = lf.start_as_current_observation(
name="eval-runner-2.7-dedup", metadata={"step": "2"}
) if lf else nullcontext()
with obs_ctx as obs:
with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
await run_local_agent(_USER_ID, config, run_log, mgr)
task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"]
task_updates = [c for c in calls if c["action"] == "update" and c.get("table") == "tasks"]
score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0 score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0
return score, f"task_creates={len(task_creates)} task_updates={len(task_updates)}"
if obs is not None: return 0.0, "no assertion defined in case"
obs.score(name="runner.dedup", value=score,
comment=f"creates={len(task_creates)} updates={len(task_updates)}")
if lf:
lf.flush()
assert score == 1.0, (
f"Expected deduplication: creates={len(task_creates)}, updates={len(task_updates)}"
)