refactor(tests): remove non-deterministic journey eval cases 4.2–4.5

Keep only case 4.1 (first reply contains a question) as an automated eval. The multi-turn cases (4.2–4.5) are non-deterministic and are tested manually, with results tracked in Langfuse.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 09:41:43 +02:00
parent 467abc8d42
commit c0aef71141
2 changed files with 11 additions and 129 deletions
--- a/tests/fixtures/journey_v2/cases.yaml
+++ b/tests/fixtures/journey_v2/cases.yaml
@@ -1,19 +1,11 @@
 # Journey V2 eval test cases — Step 4
 #
-# Each case simulates a complete journey session:
-#   1. handle_journey_start is called with directory + data_types
-#   2. handle_journey_message is called for each entry in user_messages
-#   3. Assertions are evaluated on the final reply
-#
-# directory_files: list of {path, content_file} — content_file is relative to data/
+# Only case 4.1 is kept as an automated eval. Cases 4.2–4.5 (multi-turn
+# conversations that expect the LLM to produce a complete AgentConfig)
+# are non-deterministic and tested manually — results tracked in Langfuse.
 #
 # Assertion keys:
-#   expect_question: true          → first reply must contain "?"
-#   expect_done: true              → final reply must have done=True
-#   expect_valid_config: true      → agent_config must be parseable as AgentConfig with content_types > 0
-#   expect_content_type_id: <str>  → AgentConfig.content_types must contain an entry with this id
-#   expect_extraction_contains: <str> → first content_type extraction_prompt must contain this word
-#   expect_global_rules: true      → AgentConfig.global_rules must be non-empty
+#   expect_question: true → first reply must contain "?"

 - id: "4.1"
  description: "Journey start explores directory, first reply contains a question"
@@ -25,63 +17,3 @@
  user_messages: []
  score_name: "journey.start"
  expect_question: true
-
- id: "4.2"
-  description: "Full 3-turn conversation produces a valid AgentConfig JSON"
-  directory: "/test/emails"
-  data_types: ["tasks", "notes", "timelines"]
-  directory_files:
-    - path: "/test/emails/email_backup.html"
-      content_file: "email_action.html"
-  user_messages:
-    - "These are email exports from Outlook in HTML format"
-    - "Create tasks for emails with direct action requests, notes for informational emails"
-    - "Yes, that looks correct. No other rules."
-  score_name: "journey.valid_json"
-  expect_done: true
-  expect_valid_config: true
-
- id: "4.3"
-  description: "Journey detects email_html content type from directory exploration"
-  directory: "/test/emails"
-  data_types: ["tasks", "notes"]
-  directory_files:
-    - path: "/test/emails/message.html"
-      content_file: "email_action.html"
-  user_messages:
-    - "HTML email backups from my mail client, exported from Outlook"
-    - "Create tasks from emails that contain assignments or direct action items"
-    - "Correct, no other rules needed"
-  score_name: "journey.detect_email"
-  expect_done: true
-  expect_content_type_id: "email_html"
-
- id: "4.4"
-  description: "Custom user rule (only notes, no tasks) reflected in extraction_prompt"
-  directory: "/test/emails"
-  data_types: ["notes"]
-  directory_files:
-    - path: "/test/emails/email.html"
-      content_file: "email_info.html"
-  user_messages:
-    - "HTML emails from my work inbox"
-    - "Create only notes from all emails — I do not want tasks or timelines to be created"
-    - "Yes, exactly"
-  score_name: "journey.custom_rules"
-  expect_done: true
-  expect_extraction_contains: "note"
-
- id: "4.5"
-  description: "Global rule (no project = no entity) appears in AgentConfig.global_rules"
-  directory: "/test/emails"
-  data_types: ["tasks", "notes"]
-  directory_files:
-    - path: "/test/emails/email.html"
-      content_file: "email_action.html"
-  user_messages:
-    - "Email backups from Outlook"
-    - "Create tasks from action request emails, notes from informational emails"
-    - "If the email cannot be matched to any project, do not create any entity at all"
-  score_name: "journey.global_rules"
-  expect_done: true
-  expect_global_rules: true