# Langfuse V3 does not accept user_id/session_id on observation-level calls.
# Moved to metadata dict in agent_runner, deep_agent, and agent_setup.
#
# refactor(tests): fixture-based pattern for agent_runner_v2 eval tests
#
# - cases.yaml + data/ fixtures under tests/fixtures/agent_runner_v2/
# - pytest_generate_tests parametrizes test_eval_runner from YAML
# - _resolve_projects() handles symbolic names and inline dicts
# - _evaluate_case() centralizes all assertion logic
# - --runner-dir CLI option for custom fixture folders
#
# Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
# 87 lines, 2.8 KiB, YAML
# Agent Runner V2 — eval test cases (Step 2, requires real LLM)
#
# Each case drives one parametrized `test_eval_runner` invocation.
#
# Keys
# ----
#   id:           str            unique identifier shown in pytest output
#   description:  str            human-readable label
#   file:         str            filename inside data/
#   file_path:    str            path reported to the executor (affects project-matching via filename)
#   projects:     [alpha|beta]   symbolic project names resolved by the test helper
#
# Optional pre-existing records (dedup tests)
#   existing_tasks:      list of {id, title, status, priority}
#   existing_notes:      list of {id, title, content}
#   existing_timelines:  list of {id, title, date}
#
# Assertions (one or more)
#   expect_insert:      <table>   at least 1 insert row in this table (tasks|notes|timelines)
#   expect_no_insert:   true      zero inserts in any table
#   expect_project_id:  <id>      any insert must carry this projectId
#   expect_dedup:       true      task inserts == 0 OR task updates >= 1 (dedup check)
#
# Langfuse
#   score_name:  str   observation score name

# Case 2.1: expects at least one insert row in the `tasks` table.
- id: "2.1"
  description: "Action email → create_task"
  file: email_action.html
  file_path: /emails/ProjectAlpha_action.html
  projects: [alpha, beta]
  expect_insert: tasks
  score_name: runner.email_to_task

# Case 2.2: expects at least one insert row in the `notes` table.
- id: "2.2"
  description: "Informational email → create_note"
  file: email_info.html
  file_path: /emails/ProjectAlpha_info.html
  projects: [alpha, beta]
  expect_insert: notes
  score_name: runner.email_to_note

# Case 2.3: expects at least one insert row in the `timelines` table.
- id: "2.3"
  description: "Email with meeting date → create_timeline"
  file: email_date.html
  file_path: /emails/ProjectAlpha_kickoff.html
  projects: [alpha, beta]
  expect_insert: timelines
  score_name: runner.email_to_timeline

# Case 2.4: the reported file_path contains "ProjectAlpha"; any insert must
# carry projectId `proj-alpha`.
- id: "2.4"
  description: "Filename contains project name → correct project assigned"
  file: email_action.html
  file_path: /emails/ProjectAlpha_report.html
  projects: [alpha, beta]
  expect_project_id: proj-alpha
  score_name: runner.project_filename

# Case 2.5: the reported file_path carries no project name; any insert must
# still carry projectId `proj-alpha`.
- id: "2.5"
  description: "Email body mentions project → correct project assigned"
  file: email_action.html
  file_path: /emails/email_001.html
  projects: [alpha, beta]
  expect_project_id: proj-alpha
  score_name: runner.project_content

# Case 2.6: expects zero inserts in any table.
- id: "2.6"
  description: "Newsletter + global rule no-project → no creates"
  file: email_no_project.html
  file_path: /emails/newsletter.html
  projects: [alpha, beta]
  expect_no_insert: true
  score_name: runner.no_project

# Case 2.7: seeds one pre-existing task, then applies the dedup check
# (task inserts == 0 OR task updates >= 1).
- id: "2.7"
  description: "Existing task with same title → dedup (update not create)"
  file: email_action.html
  file_path: /emails/ProjectAlpha_followup.html
  projects: [alpha]
  # Pre-existing record supplied to the run for this dedup case.
  existing_tasks:
    - id: task-existing
      title: Fix the login bug
      status: todo
      priority: medium
  expect_dedup: true
  score_name: runner.dedup