---
# Agent Runner V2 — eval test cases (Step 2, requires real LLM)
#
# Each case drives one parametrized `test_eval_runner` invocation.
#
# Keys
# ----
#   id:           str           unique identifier shown in pytest output
#   description:  str           human-readable label
#   file:         str           filename inside data/
#   file_path:    str           path reported to the executor
#                               (affects project-matching via filename)
#   projects:     [alpha|beta]  symbolic project names resolved by the
#                               test helper
#
# Optional pre-existing records (dedup tests)
#   existing_tasks:      list of {id, title, status, priority}
#   existing_notes:      list of {id, title, content}
#   existing_timelines:  list of {id, title, date}
#
# Assertions (one or more)
#   expect_insert:      at least 1 insert row in this table
#                       (tasks|notes|timelines)
#   expect_no_insert:   true  zero inserts in any table
#   expect_project_id:  any insert must carry this projectId
#   expect_dedup:       true  task inserts == 0 OR task updates >= 1
#                       (dedup check)
#
# Langfuse
#   score_name:  str  observation score name
#
# NOTE: `id` values are quoted on purpose so they load as strings
# ("2.1"), not floats.

# --- Record-type routing: which table the email ends up in ---

- id: "2.1"
  description: "Action email → create_task"
  file: email_action.html
  file_path: /emails/ProjectAlpha_action.html
  projects: [alpha, beta]
  expect_insert: tasks
  score_name: runner.email_to_task

- id: "2.2"
  description: "Informational email → create_note"
  file: email_info.html
  file_path: /emails/ProjectAlpha_info.html
  projects: [alpha, beta]
  expect_insert: notes
  score_name: runner.email_to_note

- id: "2.3"
  description: "Email with meeting date → create_timeline"
  file: email_date.html
  file_path: /emails/ProjectAlpha_kickoff.html
  projects: [alpha, beta]
  expect_insert: timelines
  score_name: runner.email_to_timeline

# --- Project assignment ---

- id: "2.4"
  description: "Filename contains project name → correct project assigned"
  file: email_action.html
  file_path: /emails/ProjectAlpha_report.html
  projects: [alpha, beta]
  expect_project_id: proj-alpha
  score_name: runner.project_filename

# Same fixture as 2.4, but the reported path carries no project name.
- id: "2.5"
  description: "Email body mentions project → correct project assigned"
  file: email_action.html
  file_path: /emails/email_001.html
  projects: [alpha, beta]
  expect_project_id: proj-alpha
  score_name: runner.project_content

# --- No-op and dedup ---

- id: "2.6"
  description: "Newsletter + global rule no-project → no creates"
  file: email_no_project.html
  file_path: /emails/newsletter.html
  projects: [alpha, beta]
  expect_no_insert: true
  score_name: runner.no_project

- id: "2.7"
  description: "Existing task with same title → dedup (update not create)"
  file: email_action.html
  file_path: /emails/ProjectAlpha_followup.html
  projects: [alpha]
  # Pre-existing record the case is seeded with before the run.
  existing_tasks:
    - id: task-existing
      title: Fix the login bug
      status: todo
      priority: medium
  expect_dedup: true
  score_name: runner.dedup