From c6c4578f9a3a82b1685c5cc4ac209980411d9faf Mon Sep 17 00:00:00 2001 From: Roberto Musso Date: Tue, 7 Apr 2026 23:04:24 +0200 Subject: [PATCH] fix(tests): migrate eval tests to Langfuse V3 API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lf.trace() and lf.score(trace_id=...) are V2 APIs that were removed in V3. V3 pattern: lf.start_as_current_observation(name=...) as a context manager → obs; obs.score(name=..., value=...); contextlib.nullcontext() when lf is None so the structure stays the same. Updated tests 2.1–2.7 in test_agent_runner_v2.py accordingly. Co-Authored-By: Claude Sonnet 4.6 --- tests/test_agent_runner_v2.py | 219 ++++++++++++++++++++-------------- 1 file changed, 129 insertions(+), 90 deletions(-) diff --git a/tests/test_agent_runner_v2.py b/tests/test_agent_runner_v2.py index fae88d9..e7bf517 100644 --- a/tests/test_agent_runner_v2.py +++ b/tests/test_agent_runner_v2.py @@ -340,43 +340,51 @@ async def test_2_8_items_created_count(): # ── Eval: 2.1–2.7 (real LLM + Langfuse scoring) ────────────────────────── +# +# Langfuse V3 pattern: +# lf.start_as_current_observation(name=...) as context manager → obs object +# obs.score(name=..., value=...) (not lf.score(trace_id=...)) +# contextlib.nullcontext() when lf is None → obs is None, no-op +# ───────────────────────────────────────────────────────────────────────── @pytest.mark.asyncio @pytest.mark.eval async def test_2_1_email_to_task(): """2.1 Action email → LLM calls create_task. 
Score: runner.email_to_task.""" + from contextlib import nullcontext lf = get_langfuse() - trace = lf.trace( - name="eval-runner-2.1-email-to-task", - metadata={"step": "2"}, - ) if lf else None config = _make_config() run_log = _make_run_log(config.id) mgr = _make_manager() - executor, calls = _make_executor( file_path="/emails/ProjectAlpha_action.html", file_content=_ACTION_EMAIL, projects=[_PROJECT_ALPHA, _PROJECT_BETA], ) - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin: - await run_local_agent(_USER_ID, config, run_log, mgr) + obs_ctx = lf.start_as_current_observation( + name="eval-runner-2.1-email-to-task", metadata={"step": "2"} + ) if lf else nullcontext() - _, kwargs = mock_fin.call_args - task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"] - score = 1.0 if len(task_creates) >= 1 else 0.0 + with obs_ctx as obs: + with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ + patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin: + await run_local_agent(_USER_ID, config, run_log, mgr) - if lf and trace: - lf.score( - trace_id=trace.id, - name="runner.email_to_task", - value=score, - comment=f"task_creates={len(task_creates)} items_created={kwargs.get('items_created')}", - ) + _, kwargs = mock_fin.call_args + task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"] + score = 1.0 if len(task_creates) >= 1 else 0.0 + + if obs is not None: + obs.score( + name="runner.email_to_task", + value=score, + comment=f"task_creates={len(task_creates)} items_created={kwargs.get('items_created')}", + ) + + if lf: lf.flush() assert score == 1.0, f"Expected at least 1 task created, got {len(task_creates)}" @@ -386,29 +394,35 @@ async def test_2_1_email_to_task(): @pytest.mark.eval async def test_2_2_email_to_note(): """2.2 Informational email → 
LLM calls create_note. Score: runner.email_to_note.""" + from contextlib import nullcontext lf = get_langfuse() - trace = lf.trace(name="eval-runner-2.2-email-to-note", metadata={"step": "2"}) if lf else None config = _make_config() run_log = _make_run_log(config.id) mgr = _make_manager() - executor, calls = _make_executor( file_path="/emails/ProjectAlpha_info.html", file_content=_INFO_EMAIL, projects=[_PROJECT_ALPHA, _PROJECT_BETA], ) - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) + obs_ctx = lf.start_as_current_observation( + name="eval-runner-2.2-email-to-note", metadata={"step": "2"} + ) if lf else nullcontext() - note_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "notes"] - score = 1.0 if len(note_creates) >= 1 else 0.0 + with obs_ctx as obs: + with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ + patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): + await run_local_agent(_USER_ID, config, run_log, mgr) - if lf and trace: - lf.score(trace_id=trace.id, name="runner.email_to_note", value=score, - comment=f"note_creates={len(note_creates)}") + note_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "notes"] + score = 1.0 if len(note_creates) >= 1 else 0.0 + + if obs is not None: + obs.score(name="runner.email_to_note", value=score, + comment=f"note_creates={len(note_creates)}") + + if lf: lf.flush() assert score == 1.0, f"Expected at least 1 note created, got {len(note_creates)}" @@ -418,29 +432,35 @@ async def test_2_2_email_to_note(): @pytest.mark.eval async def test_2_3_email_to_timeline(): """2.3 Email with event date → LLM calls create_timeline. 
Score: runner.email_to_timeline.""" + from contextlib import nullcontext lf = get_langfuse() - trace = lf.trace(name="eval-runner-2.3-email-to-timeline", metadata={"step": "2"}) if lf else None config = _make_config() run_log = _make_run_log(config.id) mgr = _make_manager() - executor, calls = _make_executor( file_path="/emails/ProjectAlpha_kickoff.html", file_content=_DATE_EMAIL, projects=[_PROJECT_ALPHA, _PROJECT_BETA], ) - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) + obs_ctx = lf.start_as_current_observation( + name="eval-runner-2.3-email-to-timeline", metadata={"step": "2"} + ) if lf else nullcontext() - tl_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "timelines"] - score = 1.0 if len(tl_creates) >= 1 else 0.0 + with obs_ctx as obs: + with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ + patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): + await run_local_agent(_USER_ID, config, run_log, mgr) - if lf and trace: - lf.score(trace_id=trace.id, name="runner.email_to_timeline", value=score, - comment=f"timeline_creates={len(tl_creates)}") + tl_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "timelines"] + score = 1.0 if len(tl_creates) >= 1 else 0.0 + + if obs is not None: + obs.score(name="runner.email_to_timeline", value=score, + comment=f"timeline_creates={len(tl_creates)}") + + if lf: lf.flush() assert score == 1.0, f"Expected at least 1 timeline created, got {len(tl_creates)}" @@ -450,33 +470,37 @@ async def test_2_3_email_to_timeline(): @pytest.mark.eval async def test_2_4_project_matching_filename(): """2.4 Filename contains 'ProjectAlpha' → LLM assigns to proj-alpha. 
Score: runner.project_filename.""" + from contextlib import nullcontext lf = get_langfuse() - trace = lf.trace(name="eval-runner-2.4-project-filename", metadata={"step": "2"}) if lf else None config = _make_config() run_log = _make_run_log(config.id) mgr = _make_manager() - executor, calls = _make_executor( file_path="/emails/ProjectAlpha_report.html", file_content=_ACTION_EMAIL, projects=[_PROJECT_ALPHA, _PROJECT_BETA], ) - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) + obs_ctx = lf.start_as_current_observation( + name="eval-runner-2.4-project-filename", metadata={"step": "2"} + ) if lf else nullcontext() - # Check that project_id = proj-alpha was used in any insert - inserts = [c for c in calls if c["action"] == "insert"] - correct_project = any( - c.get("data", {}).get("projectId") == "proj-alpha" - for c in inserts - ) - score = 1.0 if correct_project else 0.0 + with obs_ctx as obs: + with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ + patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): + await run_local_agent(_USER_ID, config, run_log, mgr) - if lf and trace: - lf.score(trace_id=trace.id, name="runner.project_filename", value=score) + inserts = [c for c in calls if c["action"] == "insert"] + correct_project = any( + c.get("data", {}).get("projectId") == "proj-alpha" for c in inserts + ) + score = 1.0 if correct_project else 0.0 + + if obs is not None: + obs.score(name="runner.project_filename", value=score) + + if lf: lf.flush() assert score == 1.0, "Expected inserts to use proj-alpha based on filename" @@ -486,32 +510,37 @@ async def test_2_4_project_matching_filename(): @pytest.mark.eval async def test_2_5_project_matching_content(): """2.5 Email body mentions 'Project Alpha' → correct project assigned. 
Score: runner.project_content.""" + from contextlib import nullcontext lf = get_langfuse() - trace = lf.trace(name="eval-runner-2.5-project-content", metadata={"step": "2"}) if lf else None config = _make_config() run_log = _make_run_log(config.id) mgr = _make_manager() - executor, calls = _make_executor( file_path="/emails/email_001.html", # generic filename, no project hint - file_content=_ACTION_EMAIL, # body mentions "Project Alpha" + file_content=_ACTION_EMAIL, # body mentions "Project Alpha" projects=[_PROJECT_ALPHA, _PROJECT_BETA], ) - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) + obs_ctx = lf.start_as_current_observation( + name="eval-runner-2.5-project-content", metadata={"step": "2"} + ) if lf else nullcontext() - inserts = [c for c in calls if c["action"] == "insert"] - correct_project = any( - c.get("data", {}).get("projectId") == "proj-alpha" - for c in inserts - ) - score = 1.0 if correct_project else 0.0 + with obs_ctx as obs: + with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ + patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): + await run_local_agent(_USER_ID, config, run_log, mgr) - if lf and trace: - lf.score(trace_id=trace.id, name="runner.project_content", value=score) + inserts = [c for c in calls if c["action"] == "insert"] + correct_project = any( + c.get("data", {}).get("projectId") == "proj-alpha" for c in inserts + ) + score = 1.0 if correct_project else 0.0 + + if obs is not None: + obs.score(name="runner.project_content", value=score) + + if lf: lf.flush() assert score == 1.0, "Expected inserts to use proj-alpha based on email body content" @@ -521,30 +550,35 @@ async def test_2_5_project_matching_content(): @pytest.mark.eval async def test_2_6_no_project_match_global_rule(): """2.6 Newsletter email + global rule 'no 
project = no entities' → no creates. Score: runner.no_project.""" + from contextlib import nullcontext lf = get_langfuse() - trace = lf.trace(name="eval-runner-2.6-no-project", metadata={"step": "2"}) if lf else None config = _make_config() run_log = _make_run_log(config.id) mgr = _make_manager() - executor, calls = _make_executor( file_path="/emails/newsletter.html", file_content=_NO_PROJECT_EMAIL, projects=[_PROJECT_ALPHA, _PROJECT_BETA], ) - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock) as mock_fin: - await run_local_agent(_USER_ID, config, run_log, mgr) + obs_ctx = lf.start_as_current_observation( + name="eval-runner-2.6-no-project", metadata={"step": "2"} + ) if lf else nullcontext() - _, kwargs = mock_fin.call_args - inserts = [c for c in calls if c["action"] == "insert"] - score = 1.0 if len(inserts) == 0 else 0.0 + with obs_ctx as obs: + with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ + patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): + await run_local_agent(_USER_ID, config, run_log, mgr) - if lf and trace: - lf.score(trace_id=trace.id, name="runner.no_project", value=score, - comment=f"inserts={len(inserts)}") + inserts = [c for c in calls if c["action"] == "insert"] + score = 1.0 if len(inserts) == 0 else 0.0 + + if obs is not None: + obs.score(name="runner.no_project", value=score, + comment=f"inserts={len(inserts)}") + + if lf: lf.flush() assert score == 1.0, f"Expected 0 inserts for unmatched newsletter, got {len(inserts)}" @@ -554,32 +588,37 @@ async def test_2_6_no_project_match_global_rule(): @pytest.mark.eval async def test_2_7_deduplication(): """2.7 Existing task with same title → LLM calls update_task, not create_task. 
Score: runner.dedup.""" + from contextlib import nullcontext lf = get_langfuse() - trace = lf.trace(name="eval-runner-2.7-dedup", metadata={"step": "2"}) if lf else None config = _make_config() run_log = _make_run_log(config.id) mgr = _make_manager() - executor, calls = _make_executor( file_path="/emails/ProjectAlpha_followup.html", - file_content=_ACTION_EMAIL, # "Fix the login bug" — already exists + file_content=_ACTION_EMAIL, # "Fix the login bug" — already exists projects=[_PROJECT_ALPHA], existing_tasks=[_EXISTING_TASK], # task already exists ) - with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ - patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): - await run_local_agent(_USER_ID, config, run_log, mgr) + obs_ctx = lf.start_as_current_observation( + name="eval-runner-2.7-dedup", metadata={"step": "2"} + ) if lf else nullcontext() - task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"] - task_updates = [c for c in calls if c["action"] == "update" and c.get("table") == "tasks"] - # Prefer update over create - score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0 + with obs_ctx as obs: + with patch("app.core.agent_runner._make_agent_executor", return_value=executor), \ + patch("app.core.agent_runner._finalize_run", new_callable=AsyncMock): + await run_local_agent(_USER_ID, config, run_log, mgr) - if lf and trace: - lf.score(trace_id=trace.id, name="runner.dedup", value=score, - comment=f"creates={len(task_creates)} updates={len(task_updates)}") + task_creates = [c for c in calls if c["action"] == "insert" and c["table"] == "tasks"] + task_updates = [c for c in calls if c["action"] == "update" and c.get("table") == "tasks"] + score = 1.0 if len(task_creates) == 0 or len(task_updates) >= 1 else 0.0 + + if obs is not None: + obs.score(name="runner.dedup", value=score, + comment=f"creates={len(task_creates)} updates={len(task_updates)}") + + if lf: lf.flush() 
assert score == 1.0, (