diff --git a/tests/fixtures/journey_v2/cases.yaml b/tests/fixtures/journey_v2/cases.yaml index 32ac4b4..df6ef07 100644 --- a/tests/fixtures/journey_v2/cases.yaml +++ b/tests/fixtures/journey_v2/cases.yaml @@ -1,19 +1,11 @@ # Journey V2 eval test cases — Step 4 # -# Each case simulates a complete journey session: -# 1. handle_journey_start is called with directory + data_types -# 2. handle_journey_message is called for each entry in user_messages -# 3. Assertions are evaluated on the final reply -# -# directory_files: list of {path, content_file} — content_file is relative to data/ +# Only case 4.1 is kept as an automated eval. Cases 4.2–4.5 (multi-turn +# conversations that expect the LLM to produce a complete AgentConfig) +# are non-deterministic and tested manually — results tracked in Langfuse. # # Assertion keys: -# expect_question: true → first reply must contain "?" -# expect_done: true → final reply must have done=True -# expect_valid_config: true → agent_config must be parseable as AgentConfig with content_types > 0 -# expect_content_type_id: → AgentConfig.content_types must contain an entry with this id -# expect_extraction_contains: → first content_type extraction_prompt must contain this word -# expect_global_rules: true → AgentConfig.global_rules must be non-empty +# expect_question: true → first reply must contain "?" - id: "4.1" description: "Journey start explores directory, first reply contains a question" @@ -25,63 +17,3 @@ user_messages: [] score_name: "journey.start" expect_question: true - -- id: "4.2" - description: "Full 3-turn conversation produces a valid AgentConfig JSON" - directory: "/test/emails" - data_types: ["tasks", "notes", "timelines"] - directory_files: - - path: "/test/emails/email_backup.html" - content_file: "email_action.html" - user_messages: - - "These are email exports from Outlook in HTML format" - - "Create tasks for emails with direct action requests, notes for informational emails" - - "Yes, that looks correct. No other rules." - score_name: "journey.valid_json" - expect_done: true - expect_valid_config: true - -- id: "4.3" - description: "Journey detects email_html content type from directory exploration" - directory: "/test/emails" - data_types: ["tasks", "notes"] - directory_files: - - path: "/test/emails/message.html" - content_file: "email_action.html" - user_messages: - - "HTML email backups from my mail client, exported from Outlook" - - "Create tasks from emails that contain assignments or direct action items" - - "Correct, no other rules needed" - score_name: "journey.detect_email" - expect_done: true - expect_content_type_id: "email_html" - -- id: "4.4" - description: "Custom user rule (only notes, no tasks) reflected in extraction_prompt" - directory: "/test/emails" - data_types: ["notes"] - directory_files: - - path: "/test/emails/email.html" - content_file: "email_info.html" - user_messages: - - "HTML emails from my work inbox" - - "Create only notes from all emails — I do not want tasks or timelines to be created" - - "Yes, exactly" - score_name: "journey.custom_rules" - expect_done: true - expect_extraction_contains: "note" - -- id: "4.5" - description: "Global rule (no project = no entity) appears in AgentConfig.global_rules" - directory: "/test/emails" - data_types: ["tasks", "notes"] - directory_files: - - path: "/test/emails/email.html" - content_file: "email_action.html" - user_messages: - - "Email backups from Outlook" - - "Create tasks from action request emails, notes from informational emails" - - "If the email cannot be matched to any project, do not create any entity at all" - score_name: "journey.global_rules" - expect_done: true - expect_global_rules: true diff --git a/tests/test_journey_v2.py b/tests/test_journey_v2.py index 3cce9af..9c09f6c 100644 --- a/tests/test_journey_v2.py +++ b/tests/test_journey_v2.py @@ -12,16 +12,17 @@ Unit tests (no LLM) 4.6e Session not found → done=True, agent_config=None 4.6f Nudge uses AGENT_CONFIG_START/END markers (not old PROMPT_TEMPLATE) -Eval tests (real LLM + Langfuse scoring) ------------------------------------------ -Cases are defined in tests/fixtures/journey_v2/cases.yaml. -Email HTML files live in tests/fixtures/journey_v2/data/. -Use --journey-dir to point at a custom folder (same structure required). +Eval test (real LLM + Langfuse scoring) +---------------------------------------- + 4.1 Journey start explores directory → first reply contains a question + +Cases 4.2–4.5 (multi-turn conversations producing a full AgentConfig) are +non-deterministic and tested manually — results tracked in Langfuse. Run: pytest tests/test_journey_v2.py -v pytest tests/test_journey_v2.py -v -k "4_6" # unit only - pytest tests/test_journey_v2.py -v -k "eval" # LLM evals only + pytest tests/test_journey_v2.py -v -k "eval" # single LLM eval pytest tests/test_journey_v2.py -v --journey-dir /p # custom fixtures """ @@ -170,57 +171,6 @@ def _evaluate_case(case: dict, reply: dict) -> tuple[float, str]: has_q = "?" in reply.get("message", "") return (1.0 if has_q else 0.0), f"first_reply_has_question={has_q}" - if case.get("expect_done") and not reply.get("done"): - return 0.0, "expected done=True but journey did not complete" - - agent_config_raw = reply.get("agent_config") - - if case.get("expect_valid_config"): - if not agent_config_raw: - return 0.0, "agent_config is None" - try: - parsed = AgentConfig.model_validate_json(agent_config_raw) - valid = len(parsed.content_types) > 0 - return (1.0 if valid else 0.0), f"content_types={len(parsed.content_types)}" - except Exception as exc: - return 0.0, f"parse error: {exc}" - - if case.get("expect_content_type_id"): - expected_id = case["expect_content_type_id"] - if not agent_config_raw: - return 0.0, "agent_config is None" - try: - parsed = AgentConfig.model_validate_json(agent_config_raw) - ids = [ct.id for ct in parsed.content_types] - found = expected_id in ids - return (1.0 if found else 0.0), f"content_type_ids={ids}, expected={expected_id}" - except Exception as exc: - return 0.0, f"parse error: {exc}" - - if case.get("expect_extraction_contains"): - keyword = case["expect_extraction_contains"].lower() - if not agent_config_raw: - return 0.0, "agent_config is None" - try: - parsed = AgentConfig.model_validate_json(agent_config_raw) - if not parsed.content_types: - return 0.0, "no content_types in config" - prompt = parsed.content_types[0].extraction_prompt.lower() - found = keyword in prompt - return (1.0 if found else 0.0), f"keyword='{keyword}' in extraction_prompt={found}" - except Exception as exc: - return 0.0, f"parse error: {exc}" - - if case.get("expect_global_rules"): - if not agent_config_raw: - return 0.0, "agent_config is None" - try: - parsed = AgentConfig.model_validate_json(agent_config_raw) - has_rules = len(parsed.global_rules) > 0 - return (1.0 if has_rules else 0.0), f"global_rules={parsed.global_rules}" - except Exception as exc: - return 0.0, f"parse error: {exc}" - return 1.0, "no specific assertion"