fix(langfuse): remove invalid user_id/session_id kwargs from start_as_current_observation

Langfuse V3 does not accept user_id/session_id on observation-level calls.
Both values are now passed in the metadata dict in agent_runner, deep_agent, and agent_setup.

refactor(tests): fixture-based pattern for agent_runner_v2 eval tests

- cases.yaml + data/ fixtures under tests/fixtures/agent_runner_v2/
- pytest_generate_tests parametrizes test_eval_runner from YAML
- _resolve_projects() handles symbolic names and inline dicts
- _evaluate_case() centralizes all assertion logic
- --runner-dir CLI option for custom fixture folders

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 00:45:15 +02:00
parent d8add7e8cb
commit e672b58b6f
9 changed files with 235 additions and 321 deletions
--- a/app/api/routes/agent_setup.py
+++ b/app/api/routes/agent_setup.py
@@ -175,7 +175,7 @@ def _build_system_prompt(
        else ""
    )
    template, prompt_obj = get_prompt_or_fallback(
-        "journey_system_v2", _JOURNEY_SYSTEM_PROMPT
+        "journey_system", _JOURNEY_SYSTEM_PROMPT
    )
    compiled = compile_prompt(
        template,
--- a/app/core/agent_runner.py
+++ b/app/core/agent_runner.py
@@ -251,7 +251,7 @@ async def _run_agent_with_tools(
        lf.start_as_current_observation(
            as_type="span",
            name=agent_name,
-            user_id=user_id or None,
+            metadata={"user_id": user_id} if user_id else None,
            input=user_message,
        )
        if lf else None
--- a/app/core/deep_agent.py
+++ b/app/core/deep_agent.py
@@ -615,8 +615,7 @@ async def _run_single_agent(
        lf.start_as_current_observation(
            as_type="span",
            name=agent_name,
-            user_id=user_id,
+            metadata={"user_id": user_id, "session_id": trace_id},
            session_id=trace_id,
            input=message,
        )
        if lf else None
@@ -740,8 +739,7 @@ async def _run_single_agent_stream(
        lf.start_as_current_observation(
            as_type="span",
            name=f"{agent_name}-stream",
-            user_id=user_id,
+            metadata={"user_id": user_id, "session_id": trace_id},
            session_id=trace_id,
            input=message,
        )
        if lf else None
--- a/tests/fixtures/agent_runner_v2/cases.yaml
+++ b/tests/fixtures/agent_runner_v2/cases.yaml
@@ -0,0 +1,86 @@
 # Agent Runner V2 — eval test cases (Step 2, requires real LLM)
 #
 # Each case drives one parametrized `test_eval_runner` invocation.
 #
 # Keys
 # ----
 # id: str                     unique identifier shown in pytest output
 # description: str            human-readable label
 # file: str                   filename inside data/
 # file_path: str              path reported to the executor (affects project-matching via filename)
 # projects: [alpha|beta]      symbolic project names resolved by the test helper
 #
 # Optional pre-existing records (dedup tests)
 # existing_tasks:             list of {id, title, status, priority}
 # existing_notes:             list of {id, title, content}
 # existing_timelines:         list of {id, title, date}
 #
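 # Assertions (define at least one; only the first one found is evaluated, checked in the
 # order expect_no_insert, expect_insert, expect_project_id, expect_dedup)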
 # expect_insert: <table>      table name or list of names; at least 1 insert row in each (tasks|notes|timelines)
 # expect_no_insert: true      zero inserts in any table
 # expect_project_id: <id>     at least one insert must carry this projectId
 # expect_dedup: true          task inserts == 0 OR task updates >= 1 (dedup check)
 #
 # Langfuse
 # score_name: str             observation score name
 - id: "2.1"
  description: "Action email → create_task"
  file: email_action.html
  file_path: /emails/ProjectAlpha_action.html
  projects: [alpha, beta]
  expect_insert: tasks
  score_name: runner.email_to_task
 - id: "2.2"
  description: "Informational email → create_note"
  file: email_info.html
  file_path: /emails/ProjectAlpha_info.html
  projects: [alpha, beta]
  expect_insert: notes
  score_name: runner.email_to_note
 - id: "2.3"
  description: "Email with meeting date → create_timeline"
  file: email_date.html
  file_path: /emails/ProjectAlpha_kickoff.html
  projects: [alpha, beta]
  expect_insert: timelines
  score_name: runner.email_to_timeline
 - id: "2.4"
  description: "Filename contains project name → correct project assigned"
  file: email_action.html
  file_path: /emails/ProjectAlpha_report.html
  projects: [alpha, beta]
  expect_project_id: proj-alpha
  score_name: runner.project_filename
 - id: "2.5"
  description: "Email body mentions project → correct project assigned"
  file: email_action.html
  file_path: /emails/email_001.html
  projects: [alpha, beta]
  expect_project_id: proj-alpha
  score_name: runner.project_content
 - id: "2.6"
  description: "Newsletter + global rule no-project → no creates"
  file: email_no_project.html
  file_path: /emails/newsletter.html
  projects: [alpha, beta]
  expect_no_insert: true
  score_name: runner.no_project
 - id: "2.7"
  description: "Existing task with same title → dedup (update not create)"
  file: email_action.html
  file_path: /emails/ProjectAlpha_followup.html
  projects: [alpha]
  existing_tasks:
    - id: task-existing
      title: Fix the login bug
      status: todo
      priority: medium
  expect_dedup: true
  score_name: runner.dedup
--- a/tests/fixtures/agent_runner_v2/data/email_action.html
+++ b/tests/fixtures/agent_runner_v2/data/email_action.html
@@ -0,0 +1,7 @@
 <html><head></head><body>
 <p><b>From:</b> boss@company.com</p>
 <p><b>To:</b> dev@company.com</p>
 <p><b>Subject:</b> Fix the login bug</p>
 <p><b>Date:</b> 2026-04-07</p>
 <p>Hi,<br>Please fix the login bug in Project Alpha by Friday. High priority!</p>
 </body></html>
--- a/tests/fixtures/agent_runner_v2/data/email_date.html
+++ b/tests/fixtures/agent_runner_v2/data/email_date.html
@@ -0,0 +1,5 @@
 <html><head></head><body>
 <p><b>From:</b> pm@company.com</p>
 <p><b>Subject:</b> Project Alpha kick-off meeting</p>
 <p>The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.</p>
 </body></html>
--- a/tests/fixtures/agent_runner_v2/data/email_info.html
+++ b/tests/fixtures/agent_runner_v2/data/email_info.html
@@ -0,0 +1,7 @@
 <html><head></head><body>
 <p><b>From:</b> pm@company.com</p>
 <p><b>To:</b> team@company.com</p>
 <p><b>Subject:</b> FYI: New policy for Project Alpha</p>
 <p>Just a heads-up that starting next week all code reviews must be done
 within 24 hours for Project Alpha. No action needed from you now.</p>
 </body></html>
--- a/tests/fixtures/agent_runner_v2/data/email_no_project.html
+++ b/tests/fixtures/agent_runner_v2/data/email_no_project.html
@@ -0,0 +1,5 @@
 <html><head></head><body>
 <p><b>From:</b> newsletter@ads.com</p>
 <p><b>Subject:</b> Weekly newsletter</p>
 <p>Check out our latest deals on electronics!</p>
 </body></html>
--- a/tests/test_agent_runner_v2.py
+++ b/tests/test_agent_runner_v2.py
@@ -4,32 +4,36 @@ Covers the unified per-file flow:
  Phase A — detect + preprocess (Python, zero LLM)
  Phase B — single LLM call with tools (classify + extract + create)
-Test cases:
+Fixture-based eval tests (2.1–2.7)
-  2.1  Happy path: email with action    → create_task called
+-----------------------------------
-  2.2  Happy path: email informative    → create_note called
+Cases are defined in tests/fixtures/agent_runner_v2/cases.yaml.
-  2.3  Happy path: email with date      → create_timeline called
+Email HTML files live in tests/fixtures/agent_runner_v2/data/.
-  2.4  Project matching via filename    → correct project_id used
+Use --runner-dir to point at a custom folder (same structure required).
-  2.5  Project matching via content     → correct project_id used
+
-  2.6  No project match + global rule   → no create_* called
+Unit tests (no LLM)
-  2.7  Deduplication                    → update_task, not create_task
+--------------------
-  2.8  items_created count (unit)       → items_created == N create_* calls
+  2.8  items_created count   → items_created == N create_* calls
-  2.9  Device offline (unit)            → status=error
+  2.9  Device offline        → status=error
-  2.10 Empty file (unit)                → items_processed=0, status=success
+  2.10 Empty file            → items_processed=0, status=success
 Run:
    pytest tests/test_agent_runner_v2.py -v
    pytest tests/test_agent_runner_v2.py -v -k "2_9 or 2_10 or 2_8"   # unit only
    pytest tests/test_agent_runner_v2.py -v -k "eval"                  # LLM evals only
    pytest tests/test_agent_runner_v2.py -v --runner-dir /path/to/dir  # custom fixtures
 """
 from __future__ import annotations
 import uuid
 from contextlib import nullcontext
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 import yaml
 from app.core.agent_runner import (
    _format_metadata,
@@ -40,7 +44,7 @@ from app.core.agent_runner import (
    run_local_agent,
 )
 from app.core.device_manager import DeviceConnectionManager
-from app.core.langfuse_client import get_langfuse, get_prompt_or_fallback
+from app.core.langfuse_client import get_langfuse
 from app.models import AgentRunLog, LocalAgentConfig
 from tests.conftest import TEST_USER_IDS
@@ -48,6 +52,8 @@ from tests.conftest import TEST_USER_IDS
 _USER_ID = TEST_USER_IDS["power"]
 _DEFAULT_FIXTURE_DIR = Path(__file__).parent / "fixtures" / "agent_runner_v2"
 _AGENT_CONFIG = {
    "content_types": [
        {
@@ -68,55 +74,53 @@ _AGENT_CONFIG = {
    "data_types": ["tasks", "notes", "timelines"],
 }
-_PROJECT_ALPHA = {"id": "proj-alpha", "name": "Project Alpha", "status": "active"}
+# Canonical project definitions, referenced symbolically in cases.yaml.
-_PROJECT_BETA  = {"id": "proj-beta",  "name": "Project Beta",  "status": "active"}
+_PROJECTS: dict[str, dict] = {
-
+    "alpha": {"id": "proj-alpha", "name": "Project Alpha", "status": "active"},
-# ── Sample email content ──────────────────────────────────────────────────
+    "beta":  {"id": "proj-beta",  "name": "Project Beta",  "status": "active"},
 _ACTION_EMAIL = """\
 <html><head></head><body>
 <p><b>From:</b> boss@company.com</p>
 <p><b>To:</b> dev@company.com</p>
 <p><b>Subject:</b> Fix the login bug</p>
 <p><b>Date:</b> 2026-04-07</p>
 <p>Hi,<br>Please fix the login bug in Project Alpha by Friday. High priority!</p>
 </body></html>
 """
 _INFO_EMAIL = """\
 <html><head></head><body>
 <p><b>From:</b> pm@company.com</p>
 <p><b>To:</b> team@company.com</p>
 <p><b>Subject:</b> FYI: New policy for Project Alpha</p>
 <p>Just a heads-up that starting next week all code reviews must be done
 within 24 hours for Project Alpha. No action needed from you now.</p>
 </body></html>
 """
 _DATE_EMAIL = """\
 <html><head></head><body>
 <p><b>From:</b> pm@company.com</p>
 <p><b>Subject:</b> Project Alpha kick-off meeting</p>
 <p>The kick-off meeting for Project Alpha is scheduled for 2026-04-15 at 10:00.</p>
 </body></html>
 """
 _NO_PROJECT_EMAIL = """\
 <html><head></head><body>
 <p><b>From:</b> newsletter@ads.com</p>
 <p><b>Subject:</b> Weekly newsletter</p>
 <p>Check out our latest deals on electronics!</p>
 </body></html>
 """
 _EXISTING_TASK = {
    "id": "task-existing",
    "title": "Fix the login bug",
    "status": "todo",
    "priority": "medium",
 }
 # ── Fixture loading ───────────────────────────────────────────────────────
 def _fixtures_dir(config) -> Path:
    """Return the fixture root directory used by the eval cases.

    Reads the ``--runner-dir`` option from *config*; a truthy value is wrapped
    in ``Path`` and returned, otherwise ``_DEFAULT_FIXTURE_DIR`` is used.
    """
    # NOTE(review): the registration of --runner-dir (pytest_addoption) is not
    # visible here — confirm it exists, since getoption() without a default
    # raises for an unregistered option.
    override = config.getoption("--runner-dir")
    return Path(override) if override else _DEFAULT_FIXTURE_DIR
 def _load_cases(config) -> list[dict]:
    """Parse ``cases.yaml`` from the active fixture directory.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``; the parsed
    document is returned as-is, without any shape validation.
    """
    # NOTE(review): an empty cases.yaml would make safe_load return None rather
    # than a list — confirm callers never hit that case.
    return yaml.safe_load(
        (_fixtures_dir(config) / "cases.yaml").read_text(encoding="utf-8")
    )
 def _read_case_file(case: dict, data_dir: Path) -> str:
    """Return the UTF-8 text of the file named by ``case["file"]`` inside *data_dir*."""
    return (data_dir / case["file"]).read_text(encoding="utf-8")
 def _resolve_projects(entries: list[str | dict]) -> list[dict]:
    """Resolve project list from YAML: symbolic names and/or inline dicts.

    String entries are looked up in ``_PROJECTS``; dict entries are passed
    through unchanged. Input order is preserved in the returned list.
    """
    result = []
    for entry in entries:
        if isinstance(entry, str):
            # NOTE(review): names missing from _PROJECTS are dropped silently,
            # so a typo in cases.yaml shrinks the project list without any
            # error — confirm this is intended.
            if entry in _PROJECTS:
                result.append(_PROJECTS[entry])
        elif isinstance(entry, dict):
            result.append(entry)
        # Entries of any other type are ignored.
    return result
 # ── pytest_generate_tests — parametrize eval tests from YAML ─────────────
 def pytest_generate_tests(metafunc):
    """Parametrize tests that request ``runner_case`` with the YAML cases.

    Tests that do not list ``runner_case`` among their fixture names are left
    untouched. Each case's ``id`` value is used as the pytest parameter id.
    """
    if "runner_case" not in metafunc.fixturenames:
        return
    # Cases come from cases.yaml under the directory chosen by _fixtures_dir().
    cases = _load_cases(metafunc.config)
    metafunc.parametrize("runner_case", cases, ids=[c["id"] for c in cases])
 # ── Test helpers ──────────────────────────────────────────────────────────
@@ -175,7 +179,7 @@ def _make_executor(
    directory listing, file reading, project/entity fetching, and CRUD.
    """
    calls: list[dict] = []
-    _projects = projects or [_PROJECT_ALPHA, _PROJECT_BETA]
+    _projects = projects if projects is not None else list(_PROJECTS.values())
    async def _executor(payload: dict) -> dict:
        action = payload.get("action", "")
@@ -184,10 +188,7 @@ def _make_executor(
        calls.append({"action": action, "table": table, "data": data})
        if action == "list_directory":
-            path = data.get("path", "") or payload.get("data", {}).get("path", "")
+            return {"entries": [{"type": "file", "path": file_path}]}
            return {
                "entries": [{"type": "file", "path": file_path}]
            }
        if action == "get_file_metadata":
            return {"modifiedAt": None}
@@ -225,7 +226,7 @@ def test_format_projects_empty():
 def test_format_projects_with_data():
-    result = _format_projects([_PROJECT_ALPHA])
+    result = _format_projects([_PROJECTS["alpha"]])
    assert "proj-alpha" in result
    assert "Project Alpha" in result
@@ -253,7 +254,6 @@ def test_get_extraction_rules_fallback():
 def test_get_no_match_behavior_from_global_rules():
    behavior = _get_no_match_behavior(_AGENT_CONFIG)
    # The global rule says "non creare alcuna entità" → skip behavior
    assert behavior  # non-empty
@@ -292,8 +292,8 @@ async def test_2_10_empty_file():
    executor, calls = _make_executor(
        file_path="/emails/empty.html",
-        file_content="",  # empty
+        file_content="",
-        projects=[_PROJECT_ALPHA],
+        projects=[_PROJECTS["alpha"]],
    )
    with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
@@ -318,11 +318,10 @@ async def test_2_8_items_created_count():
    executor, _calls = _make_executor(
        file_path="/emails/action.html",
-        file_content=_ACTION_EMAIL,
+        file_content="<html><body><p>Fix the login bug in Project Alpha.</p></body></html>",
-        projects=[_PROJECT_ALPHA],
+        projects=[_PROJECTS["alpha"]],
    )
    # Simulate the LLM calling create_task, create_note, and update_task once each.
    async def mock_run_agent(*, _tool_calls_out=None, **kw) -> str:
        if _tool_calls_out is not None:
            _tool_calls_out.extend(["create_task", "create_note", "update_task"])
@@ -339,33 +338,43 @@ async def test_2_8_items_created_count():
    assert kwargs["items_processed"] == 1
-# ── Eval: 2.1–2.7 (real LLM + Langfuse scoring) ──────────────────────────
+# ── Eval: 2.1–2.7 — fixture-driven, real LLM + Langfuse scoring ──────────
 #
-# Langfuse V3 pattern:
+# Cases loaded from tests/fixtures/agent_runner_v2/cases.yaml.
-#   lf.start_as_current_observation(name=...) as context manager → obs object
+# Supported assertions (from YAML):
-#   obs.score(name=..., value=...)  (not lf.score(trace_id=...))
+#   expect_insert: <table>   → at least 1 insert in that table
-#   contextlib.nullcontext() when lf is None → obs is None, no-op
+#   expect_no_insert: true   → zero inserts in any table
 #   expect_project_id: <id>  → at least one insert carries this projectId
 #   expect_dedup: true       → task inserts == 0 OR task updates >= 1
 # ─────────────────────────────────────────────────────────────────────────
@pytest.mark.asyncio
@pytest.mark.eval
-async def test_2_1_email_to_task():
+async def test_eval_runner(runner_case, pytestconfig):
-    """2.1 Action email → LLM calls create_task. Score: runner.email_to_task."""
+    """Parametrized eval test — one invocation per YAML case."""
-    from contextlib import nullcontext
+    case: dict = runner_case
-    lf = get_langfuse()
+    data_dir = _fixtures_dir(pytestconfig) / "data"
    file_content = _read_case_file(case, data_dir)
    projects = _resolve_projects(case.get("projects", []))
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
-        file_path="/emails/ProjectAlpha_action.html",
+        file_path=case["file_path"],
-        file_content=_ACTION_EMAIL,
+        file_content=file_content,
-        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
+        projects=projects,
        existing_tasks=case.get("existing_tasks"),
        existing_notes=case.get("existing_notes"),
        existing_timelines=case.get("existing_timelines"),
    )
    lf = get_langfuse()
    obs_ctx = lf.start_as_current_observation(
-        name="eval-runner-2.1-email-to-task", metadata={"step": "2"}
+        name=f"eval-runner-{case['id']}-{case.get('score_name', 'unknown').replace('.', '-')}",
        metadata={"step": "2", "case_id": case["id"]},
    ) if lf else nullcontext()
    with obs_ctx as obs:
@@ -374,253 +383,50 @@ async def test_2_1_email_to_task():
            await run_local_agent(_USER_ID, config, run_log, mgr)
        _, kwargs = mock_fin.call_args
-        task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"]
+        inserts = [c for c in calls if c["action"] == "insert"]
-        score = 1.0 if len(task_creates) >= 1 else 0.0
+        score, comment = _evaluate_case(case, calls, kwargs)
        if obs is not None:
            obs.score(
-                name="runner.email_to_task",
+                name=case.get("score_name", f"runner.case_{case['id']}"),
                value=score,
-                comment=f"task_creates={len(task_creates)} items_created={kwargs.get('items_created')}",
+                comment=comment,
            )
    if lf:
        lf.flush()
-    assert score == 1.0, f"Expected at least 1 task created, got {len(task_creates)}"
+    assert score == 1.0, f"[{case['id']}] {case.get('description', '')} — {comment}"
-@pytest.mark.asyncio
+def _evaluate_case(case: dict, calls: list[dict], finalize_kwargs: dict) -> tuple[float, str]:
-@pytest.mark.eval
+    """Return (score, comment) for a YAML case given the captured executor calls."""
 async def test_2_2_email_to_note():
    """2.2 Informational email → LLM calls create_note. Score: runner.email_to_note."""
    from contextlib import nullcontext
    lf = get_langfuse()
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
        file_path="/emails/ProjectAlpha_info.html",
        file_content=_INFO_EMAIL,
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )
    obs_ctx = lf.start_as_current_observation(
        name="eval-runner-2.2-email-to-note", metadata={"step": "2"}
    ) if lf else nullcontext()
    with obs_ctx as obs:
        with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
             patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
            await run_local_agent(_USER_ID, config, run_log, mgr)
        note_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "notes"]
        score = 1.0 if len(note_creates) >= 1 else 0.0
        if obs is not None:
            obs.score(name="runner.email_to_note", value=score,
                      comment=f"note_creates={len(note_creates)}")
    if lf:
        lf.flush()
    assert score == 1.0, f"Expected at least 1 note created, got {len(note_creates)}"
@pytest.mark.asyncio
@pytest.mark.eval
 async def test_2_3_email_to_timeline():
    """2.3 Email with event date → LLM calls create_timeline. Score: runner.email_to_timeline."""
    from contextlib import nullcontext
    lf = get_langfuse()
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
        file_path="/emails/ProjectAlpha_kickoff.html",
        file_content=_DATE_EMAIL,
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )
    obs_ctx = lf.start_as_current_observation(
        name="eval-runner-2.3-email-to-timeline", metadata={"step": "2"}
    ) if lf else nullcontext()
    with obs_ctx as obs:
        with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
             patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
            await run_local_agent(_USER_ID, config, run_log, mgr)
        tl_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "timelines"]
        score = 1.0 if len(tl_creates) >= 1 else 0.0
        if obs is not None:
            obs.score(name="runner.email_to_timeline", value=score,
                      comment=f"timeline_creates={len(tl_creates)}")
    if lf:
        lf.flush()
    assert score == 1.0, f"Expected at least 1 timeline created, got {len(tl_creates)}"
@pytest.mark.asyncio
@pytest.mark.eval
 async def test_2_4_project_matching_filename():
    """2.4 Filename contains 'ProjectAlpha' → LLM assigns to proj-alpha. Score: runner.project_filename."""
    from contextlib import nullcontext
    lf = get_langfuse()
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
        file_path="/emails/ProjectAlpha_report.html",
        file_content=_ACTION_EMAIL,
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )
    obs_ctx = lf.start_as_current_observation(
        name="eval-runner-2.4-project-filename", metadata={"step": "2"}
    ) if lf else nullcontext()
    with obs_ctx as obs:
        with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
             patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
            await run_local_agent(_USER_ID, config, run_log, mgr)
    inserts = [c for c in calls if c["action"] == "insert"]
        correct_project = any(
            c.get("data", {}).get("projectId") == "proj-alpha" for c in inserts
        )
        score = 1.0 if correct_project else 0.0
-        if obs is not None:
+    if case.get("expect_no_insert"):
            obs.score(name="runner.project_filename", value=score)
    if lf:
        lf.flush()
    assert score == 1.0, "Expected inserts to use proj-alpha based on filename"
@pytest.mark.asyncio
@pytest.mark.eval
 async def test_2_5_project_matching_content():
    """2.5 Email body mentions 'Project Alpha' → correct project assigned. Score: runner.project_content."""
    from contextlib import nullcontext
    lf = get_langfuse()
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
        file_path="/emails/email_001.html",  # generic filename, no project hint
        file_content=_ACTION_EMAIL,          # body mentions "Project Alpha"
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )
    obs_ctx = lf.start_as_current_observation(
        name="eval-runner-2.5-project-content", metadata={"step": "2"}
    ) if lf else nullcontext()
    with obs_ctx as obs:
        with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
             patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
            await run_local_agent(_USER_ID, config, run_log, mgr)
        inserts = [c for c in calls if c["action"] == "insert"]
        correct_project = any(
            c.get("data", {}).get("projectId") == "proj-alpha" for c in inserts
        )
        score = 1.0 if correct_project else 0.0
        if obs is not None:
            obs.score(name="runner.project_content", value=score)
    if lf:
        lf.flush()
    assert score == 1.0, "Expected inserts to use proj-alpha based on email body content"
@pytest.mark.asyncio
@pytest.mark.eval
 async def test_2_6_no_project_match_global_rule():
    """2.6 Newsletter email + global rule 'no project = no entities' → no creates. Score: runner.no_project."""
    from contextlib import nullcontext
    lf = get_langfuse()
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
        file_path="/emails/newsletter.html",
        file_content=_NO_PROJECT_EMAIL,
        projects=[_PROJECT_ALPHA, _PROJECT_BETA],
    )
    obs_ctx = lf.start_as_current_observation(
        name="eval-runner-2.6-no-project", metadata={"step": "2"}
    ) if lf else nullcontext()
    with obs_ctx as obs:
        with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
             patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
            await run_local_agent(_USER_ID, config, run_log, mgr)
        inserts = [c for c in calls if c["action"] == "insert"]
        score = 1.0 if len(inserts) == 0 else 0.0
        return score, f"inserts={len(inserts)} (expected 0)"
-        if obs is not None:
+    if "expect_insert" in case:
-            obs.score(name="runner.no_project", value=score,
+        tables = case["expect_insert"]
-                      comment=f"inserts={len(inserts)}")
+        if isinstance(tables, str):
            tables = [tables]
        missing = [t for t in tables if not any(c["table"] == t for c in inserts)]
        score = 1.0 if not missing else 0.0
        counts = {t: sum(1 for c in inserts if c["table"] == t) for t in tables}
        return score, f"inserts={counts}" + (f" missing={missing}" if missing else "")
-    if lf:
+    if "expect_project_id" in case:
-        lf.flush()
+        expected_pid = case["expect_project_id"]
        correct = any(c.get("data", {}).get("projectId") == expected_pid for c in inserts)
        score = 1.0 if correct else 0.0
        all_pids = [c.get("data", {}).get("projectId") for c in inserts]
        return score, f"projectIds={all_pids} (expected {expected_pid!r})"
-    assert score == 1.0, f"Expected 0 inserts for unmatched newsletter, got {len(inserts)}"
+    if case.get("expect_dedup"):
-
+        task_creates = [c for c in inserts if c["table"] == "tasks"]
-
+        task_updates = [c for c in calls if c["action"] == "update" and c["table"] == "tasks"]
@pytest.mark.asyncio
@pytest.mark.eval
 async def test_2_7_deduplication():
    """2.7 Existing task with same title → LLM calls update_task, not create_task. Score: runner.dedup."""
    from contextlib import nullcontext
    lf = get_langfuse()
    config = _make_config()
    run_log = _make_run_log(config.id)
    mgr = _make_manager()
    executor, calls = _make_executor(
        file_path="/emails/ProjectAlpha_followup.html",
        file_content=_ACTION_EMAIL,       # "Fix the login bug" — already exists
        projects=[_PROJECT_ALPHA],
        existing_tasks=[_EXISTING_TASK],  # task already exists
    )
    obs_ctx = lf.start_as_current_observation(
        name="eval-runner-2.7-dedup", metadata={"step": "2"}
    ) if lf else nullcontext()
    with obs_ctx as obs:
        with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \
             patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock):
            await run_local_agent(_USER_ID, config, run_log, mgr)
        task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"]
        task_updates = [c for c in calls if c["action"] == "update" and c.get("table") == "tasks"]
        score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0
        return score, f"task_creates={len(task_creates)} task_updates={len(task_updates)}"
-        if obs is not None:
+    return 0.0, "no assertion defined in case"
            obs.score(name="runner.dedup", value=score,
                      comment=f"creates={len(task_creates)} updates={len(task_updates)}")
    if lf:
        lf.flush()
    assert score == 1.0, (
        f"Expected deduplication: creates={len(task_creates)}, updates={len(task_updates)}"
    )