diff --git a/app/api/routes/agent_setup.py b/app/api/routes/agent_setup.py index 8545429..c1e063c 100644 --- a/app/api/routes/agent_setup.py +++ b/app/api/routes/agent_setup.py @@ -175,7 +175,7 @@ def _build_system_prompt( else "" ) template, prompt_obj = get_prompt_or_fallback( - "journey_system_v2", _JOURNEY_SYSTEM_PROMPT + "journey_system", _JOURNEY_SYSTEM_PROMPT ) compiled = compile_prompt( template, diff --git a/app/core/agent_runner.py b/app/core/agent_runner.py index f1d3e76..072bf7b 100644 --- a/app/core/agent_runner.py +++ b/app/core/agent_runner.py @@ -251,7 +251,7 @@ async def _run_agent_with_tools( lf.start_as_current_observation( as_type="span", name=agent_name, - user_id=user_id or None, + metadata={"user_id": user_id} if user_id else None, input=user_message, ) if lf else None diff --git a/app/core/deep_agent.py b/app/core/deep_agent.py index 0a011f2..38e85d3 100644 --- a/app/core/deep_agent.py +++ b/app/core/deep_agent.py @@ -615,8 +615,7 @@ async def _run_single_agent( lf.start_as_current_observation( as_type="span", name=agent_name, - user_id=user_id, - session_id=trace_id, + metadata={"user_id": user_id, "session_id": trace_id}, input=message, ) if lf else None @@ -740,8 +739,7 @@ async def _run_single_agent_stream( lf.start_as_current_observation( as_type="span", name=f"{agent_name}-stream", - user_id=user_id, - session_id=trace_id, + metadata={"user_id": user_id, "session_id": trace_id}, input=message, ) if lf else None diff --git a/tests/fixtures/agent_runner_v2/cases.yaml b/tests/fixtures/agent_runner_v2/cases.yaml new file mode 100644 index 0000000..e57f7b5 --- /dev/null +++ b/tests/fixtures/agent_runner_v2/cases.yaml @@ -0,0 +1,86 @@ +# Agent Runner V2 — eval test cases (Step 2, requires real LLM) +# +# Each case drives one parametrized `test_eval_runner` invocation. +# +# Keys +# ---- +# id: str unique identifier shown in pytest output +# description: str human-readable label +# file: str filename inside data/ +# file_path: str path reported to the executor (affects project-matching via filename) +# projects: [alpha|beta] symbolic project names resolved by the test helper +# +# Optional pre-existing records (dedup tests) +# existing_tasks: list of {id, title, status, priority} +# existing_notes: list of {id, title, content} +# existing_timelines: list of {id, title, date} +# +# Assertions (one or more) +# expect_insert: at least 1 insert row in this table (tasks|notes|timelines) +# expect_no_insert: true zero inserts in any table +# expect_project_id: any insert must carry this projectId +# expect_dedup: true task inserts == 0 OR task updates >= 1 (dedup check) +# +# Langfuse +# score_name: str observation score name + +- id: "2.1" + description: "Action email → create_task" + file: email_action.html + file_path: /emails/ProjectAlpha_action.html + projects: [alpha, beta] + expect_insert: tasks + score_name: runner.email_to_task + +- id: "2.2" + description: "Informational email → create_note" + file: email_info.html + file_path: /emails/ProjectAlpha_info.html + projects: [alpha, beta] + expect_insert: notes + score_name: runner.email_to_note + +- id: "2.3" + description: "Email with meeting date → create_timeline" + file: email_date.html + file_path: /emails/ProjectAlpha_kickoff.html + projects: [alpha, beta] + expect_insert: timelines + score_name: runner.email_to_timeline + +- id: "2.4" + description: "Filename contains project name → correct project assigned" + file: email_action.html + file_path: /emails/ProjectAlpha_report.html + projects: [alpha, beta] + expect_project_id: proj-alpha + score_name: runner.project_filename + +- id: "2.5" + description: "Email body mentions project → correct project assigned" + file: email_action.html + file_path: /emails/email_001.html + projects: [alpha, beta] + expect_project_id: proj-alpha + score_name: runner.project_content + +- id: "2.6" + description: "Newsletter + global rule no-project → no creates" + file: email_no_project.html + file_path: /emails/newsletter.html + projects: [alpha, beta] + expect_no_insert: true + score_name: runner.no_project + +- id: "2.7" + description: "Existing task with same title → dedup (update not create)" + file: email_action.html + file_path: /emails/ProjectAlpha_followup.html + projects: [alpha] + existing_tasks: + - id: task-existing + title: Fix the login bug + status: todo + priority: medium + expect_dedup: true + score_name: runner.dedup diff --git a/tests/fixtures/agent_runner_v2/data/email_action.html b/tests/fixtures/agent_runner_v2/data/email_action.html new file mode 100644 index 0000000..c95d2f2 --- /dev/null +++ b/tests/fixtures/agent_runner_v2/data/email_action.html @@ -0,0 +1,7 @@ + +

From: boss@company.com

+

To: dev@company.com

+

Subject: Fix the login bug

+

Date: 2026-04-07

+

Hi,
Please fix the login bug in Project Alpha by Friday. High priority!

+ diff --git a/tests/fixtures/agent_runner_v2/data/email_date.html b/tests/fixtures/agent_runner_v2/data/email_date.html new file mode 100644 index 0000000..000b915 --- /dev/null +++ b/tests/fixtures/agent_runner_v2/data/email_date.html @@ -0,0 +1,5 @@ + +

From: pm@company.com

+

Subject: Project Alpha kick-off meeting

+

The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.

+ diff --git a/tests/fixtures/agent_runner_v2/data/email_info.html b/tests/fixtures/agent_runner_v2/data/email_info.html new file mode 100644 index 0000000..01a33c8 --- /dev/null +++ b/tests/fixtures/agent_runner_v2/data/email_info.html @@ -0,0 +1,7 @@ + +

From: pm@company.com

+

To: team@company.com

+

Subject: FYI: New policy for Project Alpha

+

Just a heads-up that starting next week all code reviews must be done +within 24 hours for Project Alpha. No action needed from you now.

+ diff --git a/tests/fixtures/agent_runner_v2/data/email_no_project.html b/tests/fixtures/agent_runner_v2/data/email_no_project.html new file mode 100644 index 0000000..a76ea8f --- /dev/null +++ b/tests/fixtures/agent_runner_v2/data/email_no_project.html @@ -0,0 +1,5 @@ + +

From: newsletter@ads.com

+

Subject: Weekly newsletter

+

Check out our latest deals on electronics!

+ diff --git a/tests/test_agent_runner_v2.py b/tests/test_agent_runner_v2.py index e7bf517..ca51663 100644 --- a/tests/test_agent_runner_v2.py +++ b/tests/test_agent_runner_v2.py @@ -4,32 +4,36 @@ Covers the unified per-file flow: Phase A — detect + preprocess (Python, zero LLM) Phase B — single LLM call with tools (classify + extract + create) -Test cases: - 2.1 Happy path: email with action → create_task called - 2.2 Happy path: email informative → create_note called - 2.3 Happy path: email with date → create_timeline called - 2.4 Project matching via filename → correct project_id used - 2.5 Project matching via content → correct project_id used - 2.6 No project match + global rule → no create_* called - 2.7 Deduplication → update_task, not create_task - 2.8 items_created count (unit) → items_created == N create_* calls - 2.9 Device offline (unit) → status=error - 2.10 Empty file (unit) → items_processed=0, status=success +Fixture-based eval tests (2.1–2.7) +----------------------------------- +Cases are defined in tests/fixtures/agent_runner_v2/cases.yaml. +Email HTML files live in tests/fixtures/agent_runner_v2/data/. +Use --runner-dir to point at a custom folder (same structure required). + +Unit tests (no LLM) +-------------------- + 2.8 items_created count → items_created == N create_* calls + 2.9 Device offline → status=error + 2.10 Empty file → items_processed=0, status=success Run: pytest tests/test_agent_runner_v2.py -v pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8" # unit only pytest tests/test_agent_runner_v2.py -v -k "eval" # LLM evals only + pytest tests/test_agent_runner_v2.py -v --runner-dir /path/to/dir # custom fixtures """ from __future__ import annotations import uuid +from contextlib import nullcontext from datetime import datetime, timezone +from pathlib import Path from typing import Any from unittest.mock import AsyncMock, MagicMock, patch import pytest +import yaml from app.core.agent_runner import ( _format_metadata, @@ -40,7 +44,7 @@ from app.core.agent_runner import ( run_local_agent, ) from app.core.device_manager import DeviceConnectionManager -from app.core.langfuse_client import get_langfuse, get_prompt_or_fallback +from app.core.langfuse_client import get_langfuse from app.models import AgentRunLog, LocalAgentConfig from tests.conftest import TEST_USER_IDS @@ -48,6 +52,8 @@ from tests.conftest import TEST_USER_IDS _USER_ID = TEST_USER_IDS["power"] +_DEFAULT_FIXTURE_DIR = Path(__file__).parent / "fixtures" / "agent_runner_v2" + _AGENT_CONFIG = { "content_types": [ { @@ -68,55 +74,53 @@ _AGENT_CONFIG = { "data_types": ["tasks", "notes", "timelines"], } -_PROJECT_ALPHA = {"id": "proj-alpha", "name": "Project Alpha", "status": "active"} -_PROJECT_BETA = {"id": "proj-beta", "name": "Project Beta", "status": "active"} - -# ── Sample email content ────────────────────────────────────────────────── - -_ACTION_EMAIL = """\ - -

From: boss@company.com

-

To: dev@company.com

-

Subject: Fix the login bug

-

Date: 2026-04-07

-

Hi,
Please fix the login bug in Project Alpha by Friday. High priority!

- -""" - -_INFO_EMAIL = """\ - -

From: pm@company.com

-

To: team@company.com

-

Subject: FYI: New policy for Project Alpha

-

Just a heads-up that starting next week all code reviews must be done -within 24 hours for Project Alpha. No action needed from you now.

- -""" - -_DATE_EMAIL = """\ - -

From: pm@company.com

-

Subject: Project Alpha kick-off meeting

-

The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.

- -""" - -_NO_PROJECT_EMAIL = """\ - -

From: newsletter@ads.com

-

Subject: Weekly newsletter

-

Check out our latest deals on electronics!

- -""" - -_EXISTING_TASK = { - "id": "task-existing", - "title": "Fix the login bug", - "status": "todo", - "priority": "medium", +# Canonical project definitions, referenced symbolically in cases.yaml. +_PROJECTS: dict[str, dict] = { + "alpha": {"id": "proj-alpha", "name": "Project Alpha", "status": "active"}, + "beta": {"id": "proj-beta", "name": "Project Beta", "status": "active"}, } +# ── Fixture loading ─────────────────────────────────────────────────────── + + +def _fixtures_dir(config) -> Path: + override = config.getoption("--runner-dir") + return Path(override) if override else _DEFAULT_FIXTURE_DIR + + +def _load_cases(config) -> list[dict]: + return yaml.safe_load( + (_fixtures_dir(config) / "cases.yaml").read_text(encoding="utf-8") + ) + + +def _read_case_file(case: dict, data_dir: Path) -> str: + return (data_dir / case["file"]).read_text(encoding="utf-8") + + +def _resolve_projects(entries: list[str | dict]) -> list[dict]: + """Resolve project list from YAML: symbolic names and/or inline dicts.""" + result = [] + for entry in entries: + if isinstance(entry, str): + if entry in _PROJECTS: + result.append(_PROJECTS[entry]) + elif isinstance(entry, dict): + result.append(entry) + return result + + +# ── pytest_generate_tests — parametrize eval tests from YAML ───────────── + + +def pytest_generate_tests(metafunc): + if "runner_case" not in metafunc.fixturenames: + return + cases = _load_cases(metafunc.config) + metafunc.parametrize("runner_case", cases, ids=[c["id"] for c in cases]) + + # ── Test helpers ────────────────────────────────────────────────────────── @@ -175,7 +179,7 @@ def _make_executor( directory listing, file reading, project/entity fetching, and CRUD. """ calls: list[dict] = [] - _projects = projects or [_PROJECT_ALPHA, _PROJECT_BETA] + _projects = projects if projects is not None else list(_PROJECTS.values()) async def _executor(payload: dict) -> dict: action = payload.get("action", "") @@ -184,10 +188,7 @@ def _make_executor( calls.append({"action": action, "table": table, "data": data}) if action == "list_directory": - path = data.get("path", "") or payload.get("data", {}).get("path", "") - return { - "entries": [{"type": "file", "path": file_path}] - } + return {"entries": [{"type": "file", "path": file_path}]} if action == "get_file_metadata": return {"modifiedAt": None} @@ -225,7 +226,7 @@ def test_format_projects_empty(): def test_format_projects_with_data(): - result = _format_projects([_PROJECT_ALPHA]) + result = _format_projects([_PROJECTS["alpha"]]) assert "proj-alpha" in result assert "Project Alpha" in result @@ -253,7 +254,6 @@ def test_get_extraction_rules_fallback(): def test_get_no_match_behavior_from_global_rules(): behavior = _get_no_match_behavior(_AGENT_CONFIG) - # The global rule says "non creare alcuna entità" → skip behavior assert behavior # non-empty @@ -292,8 +292,8 @@ async def test_2_10_empty_file(): executor, calls = _make_executor( file_path="/emails/empty.html", - file_content="", # empty - projects=[_PROJECT_ALPHA], + file_content="", + projects=[_PROJECTS["alpha"]], ) with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ @@ -318,11 +318,10 @@ async def test_2_8_items_created_count(): executor, _calls = _make_executor( file_path="/emails/action.html", - file_content=_ACTION_EMAIL, - projects=[_PROJECT_ALPHA], + file_content="

Fix the login bug in Project Alpha.

", + projects=[_PROJECTS["alpha"]], ) - # Simulate LLM calling create_task twice and update_note once. async def mock_run_agent(*, _tool_calls_out=None, **kw) -> str: if _tool_calls_out is not None: _tool_calls_out.extend(["create_task", "create_note", "update_task"]) @@ -339,33 +338,43 @@ async def test_2_8_items_created_count(): assert kwargs["items_processed"] == 1 -# ── Eval: 2.1–2.7 (real LLM + Langfuse scoring) ────────────────────────── +# ── Eval: 2.1–2.7 — fixture-driven, real LLM + Langfuse scoring ────────── # -# Langfuse V3 pattern: -# lf.start_as_current_observation(name=...) as context manager → obs object -# obs.score(name=..., value=...) (not lf.score(trace_id=...)) -# contextlib.nullcontext() when lf is None → obs is None, no-op +# Cases loaded from tests/fixtures/agent_runner_v2/cases.yaml. +# Supported assertions (from YAML): +# expect_insert:
→ at least 1 insert in that table +# expect_no_insert: true → zero inserts in any table +# expect_project_id: → any insert carries this projectId +# expect_dedup: true → task inserts == 0 OR task updates >= 1 # ───────────────────────────────────────────────────────────────────────── @pytest.mark.asyncio @pytest.mark.eval -async def test_2_1_email_to_task(): - """2.1 Action email → LLM calls create_task. Score: runner.email_to_task.""" - from contextlib import nullcontext - lf = get_langfuse() +async def test_eval_runner(runner_case, pytestconfig): + """Parametrized eval test — one invocation per YAML case.""" + case: dict = runner_case + data_dir = _fixtures_dir(pytestconfig) / "data" + file_content = _read_case_file(case, data_dir) + projects = _resolve_projects(case.get("projects", [])) config = _make_config() run_log = _make_run_log(config.id) mgr = _make_manager() + executor, calls = _make_executor( - file_path="/emails/ProjectAlpha_action.html", - file_content=_ACTION_EMAIL, - projects=[_PROJECT_ALPHA, _PROJECT_BETA], + file_path=case["file_path"], + file_content=file_content, + projects=projects, + existing_tasks=case.get("existing_tasks"), + existing_notes=case.get("existing_notes"), + existing_timelines=case.get("existing_timelines"), ) + lf = get_langfuse() obs_ctx = lf.start_as_current_observation( - name="eval-runner-2.1-email-to-task", metadata={"step": "2"} + name=f"eval-runner-{case['id']}-{case.get('score_name', 'unknown').replace('.', '-')}", + metadata={"step": "2", "case_id": case["id"]}, ) if lf else nullcontext() with obs_ctx as obs: @@ -374,253 +383,50 @@ async def test_2_1_email_to_task(): await run_local_agent(_USER_ID, config, run_log, mgr) _, kwargs = mock_fin.call_args - task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"] - score = 1.0 if len(task_creates) >= 1 else 0.0 + inserts = [c for c in calls if c["action"] == "insert"] + score, comment = _evaluate_case(case, calls, kwargs) if obs is not None: obs.score( - name="runner.email_to_task", + name=case.get("score_name", f"runner.case_{case['id']}"), value=score, - comment=f"task_creates={len(task_creates)} items_created={kwargs.get('items_created')}", + comment=comment, ) if lf: lf.flush() - assert score == 1.0, f"Expected at least 1 task created, got {len(task_creates)}" + assert score == 1.0, f"[{case['id']}] {case.get('description', '')} — {comment}" -@pytest.mark.asyncio -@pytest.mark.eval -async def test_2_2_email_to_note(): - """2.2 Informational email → LLM calls create_note. Score: runner.email_to_note.""" - from contextlib import nullcontext - lf = get_langfuse() +def _evaluate_case(case: dict, calls: list[dict], finalize_kwargs: dict) -> tuple[float, str]: + """Return (score, comment) for a YAML case given the captured executor calls.""" + inserts = [c for c in calls if c["action"] == "insert"] - config = _make_config() - run_log = _make_run_log(config.id) - mgr = _make_manager() - executor, calls = _make_executor( - file_path="/emails/ProjectAlpha_info.html", - file_content=_INFO_EMAIL, - projects=[_PROJECT_ALPHA, _PROJECT_BETA], - ) - - obs_ctx = lf.start_as_current_observation( - name="eval-runner-2.2-email-to-note", metadata={"step": "2"} - ) if lf else nullcontext() - - with obs_ctx as obs: - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) - - note_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "notes"] - score = 1.0 if len(note_creates) >= 1 else 0.0 - - if obs is not None: - obs.score(name="runner.email_to_note", value=score, - comment=f"note_creates={len(note_creates)}") - - if lf: - lf.flush() - - assert score == 1.0, f"Expected at least 1 note created, got {len(note_creates)}" - - -@pytest.mark.asyncio -@pytest.mark.eval -async def test_2_3_email_to_timeline(): - """2.3 Email with event date → LLM calls create_timeline. Score: runner.email_to_timeline.""" - from contextlib import nullcontext - lf = get_langfuse() - - config = _make_config() - run_log = _make_run_log(config.id) - mgr = _make_manager() - executor, calls = _make_executor( - file_path="/emails/ProjectAlpha_kickoff.html", - file_content=_DATE_EMAIL, - projects=[_PROJECT_ALPHA, _PROJECT_BETA], - ) - - obs_ctx = lf.start_as_current_observation( - name="eval-runner-2.3-email-to-timeline", metadata={"step": "2"} - ) if lf else nullcontext() - - with obs_ctx as obs: - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) - - tl_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "timelines"] - score = 1.0 if len(tl_creates) >= 1 else 0.0 - - if obs is not None: - obs.score(name="runner.email_to_timeline", value=score, - comment=f"timeline_creates={len(tl_creates)}") - - if lf: - lf.flush() - - assert score == 1.0, f"Expected at least 1 timeline created, got {len(tl_creates)}" - - -@pytest.mark.asyncio -@pytest.mark.eval -async def test_2_4_project_matching_filename(): - """2.4 Filename contains 'ProjectAlpha' → LLM assigns to proj-alpha. Score: runner.project_filename.""" - from contextlib import nullcontext - lf = get_langfuse() - - config = _make_config() - run_log = _make_run_log(config.id) - mgr = _make_manager() - executor, calls = _make_executor( - file_path="/emails/ProjectAlpha_report.html", - file_content=_ACTION_EMAIL, - projects=[_PROJECT_ALPHA, _PROJECT_BETA], - ) - - obs_ctx = lf.start_as_current_observation( - name="eval-runner-2.4-project-filename", metadata={"step": "2"} - ) if lf else nullcontext() - - with obs_ctx as obs: - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) - - inserts = [c for c in calls if c["action"] == "insert"] - correct_project = any( - c.get("data", {}).get("projectId") == "proj-alpha" for c in inserts - ) - score = 1.0 if correct_project else 0.0 - - if obs is not None: - obs.score(name="runner.project_filename", value=score) - - if lf: - lf.flush() - - assert score == 1.0, "Expected inserts to use proj-alpha based on filename" - - -@pytest.mark.asyncio -@pytest.mark.eval -async def test_2_5_project_matching_content(): - """2.5 Email body mentions 'Project Alpha' → correct project assigned. Score: runner.project_content.""" - from contextlib import nullcontext - lf = get_langfuse() - - config = _make_config() - run_log = _make_run_log(config.id) - mgr = _make_manager() - executor, calls = _make_executor( - file_path="/emails/email_001.html", # generic filename, no project hint - file_content=_ACTION_EMAIL, # body mentions "Project Alpha" - projects=[_PROJECT_ALPHA, _PROJECT_BETA], - ) - - obs_ctx = lf.start_as_current_observation( - name="eval-runner-2.5-project-content", metadata={"step": "2"} - ) if lf else nullcontext() - - with obs_ctx as obs: - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) - - inserts = [c for c in calls if c["action"] == "insert"] - correct_project = any( - c.get("data", {}).get("projectId") == "proj-alpha" for c in inserts - ) - score = 1.0 if correct_project else 0.0 - - if obs is not None: - obs.score(name="runner.project_content", value=score) - - if lf: - lf.flush() - - assert score == 1.0, "Expected inserts to use proj-alpha based on email body content" - - -@pytest.mark.asyncio -@pytest.mark.eval -async def test_2_6_no_project_match_global_rule(): - """2.6 Newsletter email + global rule 'no project = no entities' → no creates. Score: runner.no_project.""" - from contextlib import nullcontext - lf = get_langfuse() - - config = _make_config() - run_log = _make_run_log(config.id) - mgr = _make_manager() - executor, calls = _make_executor( - file_path="/emails/newsletter.html", - file_content=_NO_PROJECT_EMAIL, - projects=[_PROJECT_ALPHA, _PROJECT_BETA], - ) - - obs_ctx = lf.start_as_current_observation( - name="eval-runner-2.6-no-project", metadata={"step": "2"} - ) if lf else nullcontext() - - with obs_ctx as obs: - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) - - inserts = [c for c in calls if c["action"] == "insert"] + if case.get("expect_no_insert"): score = 1.0 if len(inserts) == 0 else 0.0 + return score, f"inserts={len(inserts)} (expected 0)" - if obs is not None: - obs.score(name="runner.no_project", value=score, - comment=f"inserts={len(inserts)}") + if "expect_insert" in case: + tables = case["expect_insert"] + if isinstance(tables, str): + tables = [tables] + missing = [t for t in tables if not any(c["table"] == t for c in inserts)] + score = 1.0 if not missing else 0.0 + counts = {t: sum(1 for c in inserts if c["table"] == t) for t in tables} + return score, f"inserts={counts}" + (f" missing={missing}" if missing else "") - if lf: - lf.flush() + if "expect_project_id" in case: + expected_pid = case["expect_project_id"] + correct = any(c.get("data", {}).get("projectId") == expected_pid for c in inserts) + score = 1.0 if correct else 0.0 + all_pids = [c.get("data", {}).get("projectId") for c in inserts] + return score, f"projectIds={all_pids} (expected {expected_pid!r})" - assert score == 1.0, f"Expected 0 inserts for unmatched newsletter, got {len(inserts)}" - - -@pytest.mark.asyncio -@pytest.mark.eval -async def test_2_7_deduplication(): - """2.7 Existing task with same title → LLM calls update_task, not create_task. Score: runner.dedup.""" - from contextlib import nullcontext - lf = get_langfuse() - - config = _make_config() - run_log = _make_run_log(config.id) - mgr = _make_manager() - executor, calls = _make_executor( - file_path="/emails/ProjectAlpha_followup.html", - file_content=_ACTION_EMAIL, # "Fix the login bug" — already exists - projects=[_PROJECT_ALPHA], - existing_tasks=[_EXISTING_TASK], # task already exists - ) - - obs_ctx = lf.start_as_current_observation( - name="eval-runner-2.7-dedup", metadata={"step": "2"} - ) if lf else nullcontext() - - with obs_ctx as obs: - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) - - task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"] - task_updates = [c for c in calls if c["action"] == "update" and c.get("table") == "tasks"] + if case.get("expect_dedup"): + task_creates = [c for c in inserts if c["table"] == "tasks"] + task_updates = [c for c in calls if c["action"] == "update" and c["table"] == "tasks"] score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0 + return score, f"task_creates={len(task_creates)} task_updates={len(task_updates)}" - if obs is not None: - obs.score(name="runner.dedup", value=score, - comment=f"creates={len(task_creates)} updates={len(task_updates)}") - - if lf: - lf.flush() - - assert score == 1.0, ( - f"Expected deduplication: creates={len(task_creates)}, updates={len(task_updates)}" - ) + return 0.0, "no assertion defined in case"