refactor(tests): remove non-deterministic journey eval cases 4.2–4.5
Keep only 4.1 (first reply contains a question) as the automated eval.
Multi-turn cases (4.2–4.5) are non-deterministic and are tested manually, with results tracked in Langfuse.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,16 +12,17 @@ Unit tests (no LLM)
|
||||
4.6e Session not found → done=True, agent_config=None
|
||||
4.6f Nudge uses AGENT_CONFIG_START/END markers (not old PROMPT_TEMPLATE)
|
||||
|
||||
Eval tests (real LLM + Langfuse scoring)
|
||||
-----------------------------------------
|
||||
Cases are defined in tests/fixtures/journey_v2/cases.yaml.
|
||||
Email HTML files live in tests/fixtures/journey_v2/data/.
|
||||
Use --journey-dir to point at a custom folder (same structure required).
|
||||
Eval test (real LLM + Langfuse scoring)
|
||||
----------------------------------------
|
||||
4.1 Journey start explores directory → first reply contains a question
|
||||
|
||||
Cases 4.2–4.5 (multi-turn conversations producing a full AgentConfig) are
|
||||
non-deterministic and tested manually — results tracked in Langfuse.
|
||||
|
||||
Run:
|
||||
pytest tests/test_journey_v2.py -v
|
||||
pytest tests/test_journey_v2.py -v -k "4_6" # unit only
|
||||
pytest tests/test_journey_v2.py -v -k "eval" # LLM evals only
|
||||
pytest tests/test_journey_v2.py -v -k "eval" # single LLM eval
|
||||
pytest tests/test_journey_v2.py -v --journey-dir /p # custom fixtures
|
||||
"""
|
||||
|
||||
@@ -170,57 +171,6 @@ def _evaluate_case(case: dict, reply: dict) -> tuple[float, str]:
|
||||
has_q = "?" in reply.get("message", "")
|
||||
return (1.0 if has_q else 0.0), f"first_reply_has_question={has_q}"
|
||||
|
||||
if case.get("expect_done") and not reply.get("done"):
|
||||
return 0.0, "expected done=True but journey did not complete"
|
||||
|
||||
agent_config_raw = reply.get("agent_config")
|
||||
|
||||
if case.get("expect_valid_config"):
|
||||
if not agent_config_raw:
|
||||
return 0.0, "agent_config is None"
|
||||
try:
|
||||
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
||||
valid = len(parsed.content_types) > 0
|
||||
return (1.0 if valid else 0.0), f"content_types={len(parsed.content_types)}"
|
||||
except Exception as exc:
|
||||
return 0.0, f"parse error: {exc}"
|
||||
|
||||
if case.get("expect_content_type_id"):
|
||||
expected_id = case["expect_content_type_id"]
|
||||
if not agent_config_raw:
|
||||
return 0.0, "agent_config is None"
|
||||
try:
|
||||
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
||||
ids = [ct.id for ct in parsed.content_types]
|
||||
found = expected_id in ids
|
||||
return (1.0 if found else 0.0), f"content_type_ids={ids}, expected={expected_id}"
|
||||
except Exception as exc:
|
||||
return 0.0, f"parse error: {exc}"
|
||||
|
||||
if case.get("expect_extraction_contains"):
|
||||
keyword = case["expect_extraction_contains"].lower()
|
||||
if not agent_config_raw:
|
||||
return 0.0, "agent_config is None"
|
||||
try:
|
||||
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
||||
if not parsed.content_types:
|
||||
return 0.0, "no content_types in config"
|
||||
prompt = parsed.content_types[0].extraction_prompt.lower()
|
||||
found = keyword in prompt
|
||||
return (1.0 if found else 0.0), f"keyword='{keyword}' in extraction_prompt={found}"
|
||||
except Exception as exc:
|
||||
return 0.0, f"parse error: {exc}"
|
||||
|
||||
if case.get("expect_global_rules"):
|
||||
if not agent_config_raw:
|
||||
return 0.0, "agent_config is None"
|
||||
try:
|
||||
parsed = AgentConfig.model_validate_json(agent_config_raw)
|
||||
has_rules = len(parsed.global_rules) > 0
|
||||
return (1.0 if has_rules else 0.0), f"global_rules={parsed.global_rules}"
|
||||
except Exception as exc:
|
||||
return 0.0, f"parse error: {exc}"
|
||||
|
||||
return 1.0, "no specific assertion"
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user