refactor(tests): remove non-deterministic journey eval cases 4.2–4.5
Keep only case 4.1 (first reply contains a question) as an automated eval. Multi-turn cases (4.2–4.5) are non-deterministic and are tested manually, with results tracked in Langfuse.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
76
tests/fixtures/journey_v2/cases.yaml
vendored
76
tests/fixtures/journey_v2/cases.yaml
vendored
@@ -1,19 +1,11 @@
|
|||||||
# Journey V2 eval test cases — Step 4
|
# Journey V2 eval test cases — Step 4
|
||||||
#
|
#
|
||||||
# Each case simulates a complete journey session:
|
# Only case 4.1 is kept as an automated eval. Cases 4.2–4.5 (multi-turn
|
||||||
# 1. handle_journey_start is called with directory + data_types
|
# conversations that expect the LLM to produce a complete AgentConfig)
|
||||||
# 2. handle_journey_message is called for each entry in user_messages
|
# are non-deterministic and tested manually — results tracked in Langfuse.
|
||||||
# 3. Assertions are evaluated on the final reply
|
|
||||||
#
|
|
||||||
# directory_files: list of {path, content_file} — content_file is relative to data/
|
|
||||||
#
|
#
|
||||||
# Assertion keys:
|
# Assertion keys:
|
||||||
# expect_question: true → first reply must contain "?"
|
# expect_question: true → first reply must contain "?"
|
||||||
# expect_done: true → final reply must have done=True
|
|
||||||
# expect_valid_config: true → agent_config must be parseable as AgentConfig with content_types > 0
|
|
||||||
# expect_content_type_id: <str> → AgentConfig.content_types must contain an entry with this id
|
|
||||||
# expect_extraction_contains: <str> → first content_type extraction_prompt must contain this word
|
|
||||||
# expect_global_rules: true → AgentConfig.global_rules must be non-empty
|
|
||||||
|
|
||||||
- id: "4.1"
|
- id: "4.1"
|
||||||
description: "Journey start explores directory, first reply contains a question"
|
description: "Journey start explores directory, first reply contains a question"
|
||||||
@@ -25,63 +17,3 @@
|
|||||||
user_messages: []
|
user_messages: []
|
||||||
score_name: "journey.start"
|
score_name: "journey.start"
|
||||||
expect_question: true
|
expect_question: true
|
||||||
|
|
||||||
- id: "4.2"
|
|
||||||
description: "Full 3-turn conversation produces a valid AgentConfig JSON"
|
|
||||||
directory: "/test/emails"
|
|
||||||
data_types: ["tasks", "notes", "timelines"]
|
|
||||||
directory_files:
|
|
||||||
- path: "/test/emails/email_backup.html"
|
|
||||||
content_file: "email_action.html"
|
|
||||||
user_messages:
|
|
||||||
- "These are email exports from Outlook in HTML format"
|
|
||||||
- "Create tasks for emails with direct action requests, notes for informational emails"
|
|
||||||
- "Yes, that looks correct. No other rules."
|
|
||||||
score_name: "journey.valid_json"
|
|
||||||
expect_done: true
|
|
||||||
expect_valid_config: true
|
|
||||||
|
|
||||||
- id: "4.3"
|
|
||||||
description: "Journey detects email_html content type from directory exploration"
|
|
||||||
directory: "/test/emails"
|
|
||||||
data_types: ["tasks", "notes"]
|
|
||||||
directory_files:
|
|
||||||
- path: "/test/emails/message.html"
|
|
||||||
content_file: "email_action.html"
|
|
||||||
user_messages:
|
|
||||||
- "HTML email backups from my mail client, exported from Outlook"
|
|
||||||
- "Create tasks from emails that contain assignments or direct action items"
|
|
||||||
- "Correct, no other rules needed"
|
|
||||||
score_name: "journey.detect_email"
|
|
||||||
expect_done: true
|
|
||||||
expect_content_type_id: "email_html"
|
|
||||||
|
|
||||||
- id: "4.4"
|
|
||||||
description: "Custom user rule (only notes, no tasks) reflected in extraction_prompt"
|
|
||||||
directory: "/test/emails"
|
|
||||||
data_types: ["notes"]
|
|
||||||
directory_files:
|
|
||||||
- path: "/test/emails/email.html"
|
|
||||||
content_file: "email_info.html"
|
|
||||||
user_messages:
|
|
||||||
- "HTML emails from my work inbox"
|
|
||||||
- "Create only notes from all emails — I do not want tasks or timelines to be created"
|
|
||||||
- "Yes, exactly"
|
|
||||||
score_name: "journey.custom_rules"
|
|
||||||
expect_done: true
|
|
||||||
expect_extraction_contains: "note"
|
|
||||||
|
|
||||||
- id: "4.5"
|
|
||||||
description: "Global rule (no project = no entity) appears in AgentConfig.global_rules"
|
|
||||||
directory: "/test/emails"
|
|
||||||
data_types: ["tasks", "notes"]
|
|
||||||
directory_files:
|
|
||||||
- path: "/test/emails/email.html"
|
|
||||||
content_file: "email_action.html"
|
|
||||||
user_messages:
|
|
||||||
- "Email backups from Outlook"
|
|
||||||
- "Create tasks from action request emails, notes from informational emails"
|
|
||||||
- "If the email cannot be matched to any project, do not create any entity at all"
|
|
||||||
score_name: "journey.global_rules"
|
|
||||||
expect_done: true
|
|
||||||
expect_global_rules: true
|
|
||||||
|
|||||||
@@ -12,16 +12,17 @@ Unit tests (no LLM)
|
|||||||
4.6e Session not found → done=True, agent_config=None
|
4.6e Session not found → done=True, agent_config=None
|
||||||
4.6f Nudge uses AGENT_CONFIG_START/END markers (not old PROMPT_TEMPLATE)
|
4.6f Nudge uses AGENT_CONFIG_START/END markers (not old PROMPT_TEMPLATE)
|
||||||
|
|
||||||
Eval tests (real LLM + Langfuse scoring)
|
Eval test (real LLM + Langfuse scoring)
|
||||||
-----------------------------------------
|
----------------------------------------
|
||||||
Cases are defined in tests/fixtures/journey_v2/cases.yaml.
|
4.1 Journey start explores directory → first reply contains a question
|
||||||
Email HTML files live in tests/fixtures/journey_v2/data/.
|
|
||||||
Use --journey-dir to point at a custom folder (same structure required).
|
Cases 4.2–4.5 (multi-turn conversations producing a full AgentConfig) are
|
||||||
|
non-deterministic and tested manually — results tracked in Langfuse.
|
||||||
|
|
||||||
Run:
|
Run:
|
||||||
pytest tests/test_journey_v2.py -v
|
pytest tests/test_journey_v2.py -v
|
||||||
pytest tests/test_journey_v2.py -v -k "4_6" # unit only
|
pytest tests/test_journey_v2.py -v -k "4_6" # unit only
|
||||||
pytest tests/test_journey_v2.py -v -k "eval" # LLM evals only
|
pytest tests/test_journey_v2.py -v -k "eval" # single LLM eval
|
||||||
pytest tests/test_journey_v2.py -v --journey-dir /p # custom fixtures
|
pytest tests/test_journey_v2.py -v --journey-dir /p # custom fixtures
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -170,57 +171,6 @@ def _evaluate_case(case: dict, reply: dict) -> tuple[float, str]:
|
|||||||
has_q = "?" in reply.get("message", "")
|
has_q = "?" in reply.get("message", "")
|
||||||
return (1.0 if has_q else 0.0), f"first_reply_has_question={has_q}"
|
return (1.0 if has_q else 0.0), f"first_reply_has_question={has_q}"
|
||||||
|
|
||||||
if case.get("expect_done") and not reply.get("done"):
|
|
||||||
return 0.0, "expected done=True but journey did not complete"
|
|
||||||
|
|
||||||
agent_config_raw = reply.get("agent_config")
|
|
||||||
|
|
||||||
if case.get("expect_valid_config"):
|
|
||||||
if not agent_config_raw:
|
|
||||||
return 0.0, "agent_config is None"
|
|
||||||
try:
|
|
||||||
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
|
||||||
valid = len(parsed.content_types) > 0
|
|
||||||
return (1.0 if valid else 0.0), f"content_types={len(parsed.content_types)}"
|
|
||||||
except Exception as exc:
|
|
||||||
return 0.0, f"parse error: {exc}"
|
|
||||||
|
|
||||||
if case.get("expect_content_type_id"):
|
|
||||||
expected_id = case["expect_content_type_id"]
|
|
||||||
if not agent_config_raw:
|
|
||||||
return 0.0, "agent_config is None"
|
|
||||||
try:
|
|
||||||
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
|
||||||
ids = [ct.id for ct in parsed.content_types]
|
|
||||||
found = expected_id in ids
|
|
||||||
return (1.0 if found else 0.0), f"content_type_ids={ids}, expected={expected_id}"
|
|
||||||
except Exception as exc:
|
|
||||||
return 0.0, f"parse error: {exc}"
|
|
||||||
|
|
||||||
if case.get("expect_extraction_contains"):
|
|
||||||
keyword = case["expect_extraction_contains"].lower()
|
|
||||||
if not agent_config_raw:
|
|
||||||
return 0.0, "agent_config is None"
|
|
||||||
try:
|
|
||||||
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
|
||||||
if not parsed.content_types:
|
|
||||||
return 0.0, "no content_types in config"
|
|
||||||
prompt = parsed.content_types[0].extraction_prompt.lower()
|
|
||||||
found = keyword in prompt
|
|
||||||
return (1.0 if found else 0.0), f"keyword='{keyword}' in extraction_prompt={found}"
|
|
||||||
except Exception as exc:
|
|
||||||
return 0.0, f"parse error: {exc}"
|
|
||||||
|
|
||||||
if case.get("expect_global_rules"):
|
|
||||||
if not agent_config_raw:
|
|
||||||
return 0.0, "agent_config is None"
|
|
||||||
try:
|
|
||||||
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
|
||||||
has_rules = len(parsed.global_rules) > 0
|
|
||||||
return (1.0 if has_rules else 0.0), f"global_rules={parsed.global_rules}"
|
|
||||||
except Exception as exc:
|
|
||||||
return 0.0, f"parse error: {exc}"
|
|
||||||
|
|
||||||
return 1.0, "no specific assertion"
|
return 1.0, "no specific assertion"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user