diff --git a/app/api/routes/agent_setup.py b/app/api/routes/agent_setup.py index 2efe891..8545429 100644 --- a/app/api/routes/agent_setup.py +++ b/app/api/routes/agent_setup.py @@ -1,11 +1,11 @@ -"""Chatbot Journey — WS-based guided conversation to build an agent prompt_template. +"""Chatbot Journey — WS-based guided conversation to build an AgentConfig. The journey is driven entirely through WebSocket frames (no REST endpoints). The device WS handler dispatches ``journey_start`` and ``journey_message`` frames to the functions exported here. Journey flow: - 1. FE sends ``journey_start`` frame with basic agent config (directory, + 1. FE sends ``journey_start`` frame with basic agent info (directory, data_types, schedule). 2. Server creates an in-memory session, sets up a WS executor so the setup LLM can use file-system tools, does a first directory scrape, @@ -13,10 +13,11 @@ Journey flow: 3. FE sends ``journey_message`` frames for each user reply. 4. Server appends the user message, calls the LLM (which may read files via tools), and sends back a ``journey_reply``. - 5. After 3-5 turns the LLM wraps up by emitting a ``prompt_template`` - block delimited by ``PROMPT_TEMPLATE_START`` / ``PROMPT_TEMPLATE_END``. - 6. Server parses the block, sends ``journey_reply`` with ``done=True`` - and the template. FE stores it locally. + 5. After 3-5 turns the LLM wraps up by emitting an ``AgentConfig`` JSON + block delimited by ``AGENT_CONFIG_START`` / ``AGENT_CONFIG_END``. + 6. Server parses and validates the JSON with Pydantic, sends + ``journey_reply`` with ``done=True`` and the serialised config. + FE stores it locally. """ from __future__ import annotations @@ -34,6 +35,7 @@ from app.agents.filesystem_agent import FILESYSTEM_TOOLS from app.config.settings import settings from app.core.langfuse_client import compile_prompt, extract_usage, get_langfuse, get_prompt_or_fallback from app.core.llm import get_llm +from app.schemas import AgentConfig logger = logging.getLogger(__name__) @@ -41,9 +43,9 @@ logger = logging.getLogger(__name__) _SESSION_TTL_SECONDS: int = 1800 # 30 minutes -# Sentinel strings used to delimit the LLM-produced prompt_template. -_TEMPLATE_START = "PROMPT_TEMPLATE_START" -_TEMPLATE_END = "PROMPT_TEMPLATE_END" +# Sentinel strings used to delimit the LLM-produced AgentConfig JSON. +_CONFIG_START = "AGENT_CONFIG_START" +_CONFIG_END = "AGENT_CONFIG_END" # Minimum turns before we consider nudging the LLM to wrap up. _MIN_TURNS_BEFORE_NUDGE: int = 3 @@ -86,61 +88,76 @@ def get_journey_session(session_id: str, user_id: str) -> JourneySession | None: return s -# ── System prompt builder ───────────────────────────────────────────────── +# ── System prompt ───────────────────────────────────────────────────────── _JOURNEY_SYSTEM_PROMPT = """\ You are a friendly assistant helping a freelancer configure a data-extraction agent. -Your job is to understand exactly what data the user wants to extract from their -local directory and produce a detailed prompt_template that a separate AI will use -as its instruction set. - -The extraction agent already has this base behaviour built in: - - Reads each file using file-system tools. - - Creates records (tasks, notes, timelines, projects) via CRUD tools. - - Sets isAiSuggested=1 on every new record. - - Only extracts data explicitly present in the files — it never invents information. -The user's custom prompt is appended AFTER this base behaviour, so focus on -what to look for and how to map it — not on the general extraction mechanics. +Your job is to understand what files the user has in their directory and produce a +structured AgentConfig JSON that the extraction agent will use as its instruction set. You have access to file-system tools to explore the user's directory: -- list_directory: to see folder structure -- read_file_content: to peek at file contents -- get_file_metadata: to check file info +- list_directory: see folder structure and file names +- read_file_content: peek at a file's content +- get_file_metadata: check file size, extension, dates The user's configured directory is: {directory} Target data types: {data_types} -IMPORTANT — project assignment is handled automatically by the main agent runner -before the custom prompt is ever used. You MUST NOT ask the user about projects, -projectId, or how to link records to projects. Never include projectId logic or -project creation instructions in the generated prompt_template. +## Your process -Start by exploring the directory to understand its structure. Then ask concise, -focused questions one at a time. Cover these topics (not necessarily in this order): - 1. The type and format of the source content (confirmed by your exploration). - 2. How fields should be mapped (e.g. filename → task title). - 3. Priority or status rules (e.g. "urgent" keyword → high priority). - 4. Any special handling, date extraction, or exclusions. +### Step 1 — Explore the directory +Use list_directory and read_file_content to understand what types of files are present +(HTML emails, plain-text documents, CSVs, etc.). -Once you reach 90% confidence, output the final prompt_template between these exact -markers on their own lines: +### Step 2 — Identify content types +For each distinct file type found, decide: +- A short id (e.g. "email_html", "plain_text", "csv") +- Which preprocessing handler to use: "email_html" for HTML emails, "generic" for everything else +- A human-readable label and optional detection_hint -{template_start} - -{template_end} +### Step 3 — Ask focused questions (one at a time) +Cover these topics based on what you discovered: +1. How to map content to entity types (task / note / timeline entry) +2. Field mapping rules (e.g. email Subject → task title, filename → note title) +3. Priority or status rules (e.g. "urgent" in subject → high priority) +4. Date extraction (e.g. "by Friday" → dueDate) +5. Exclusion rules (e.g. skip newsletters, skip files with no project match) -The prompt_template must be a self-contained instruction for an AI that reads files -and must perform CRUD operations using tools to create records. It should specify: - - What entity types to create (tasks, notes, timelines) — never projects. - - How to map file content to record fields (camelCase: title, status, priority, - dueDate, content, etc.) — never include projectId. - - That isAiSuggested must be set to 1 on every new record. - - Concrete examples of mappings based on what you discovered in the directory. +### Step 4 — Produce the AgentConfig JSON +Once you are ≥ 90% confident, output the final config between these exact markers +(each on its own line): + +{config_start} +{{ + "content_types": [ + {{ + "id": "email_html", + "label": "Email HTML", + "detection_hint": "HTML file with From/To/Subject headers", + "preprocessing": "email_html", + "extraction_prompt": "Detailed extraction instructions for this content type..." + }} + ], + "global_rules": [ + "If the file cannot be matched to any project, do not create any entity." + ], + "data_types": {data_types_json} +}} +{config_end} + +## Rules for the extraction_prompt field +- Describe when to create a task vs note vs timeline entry (be specific and concrete) +- Include field mapping rules based on what you found in the directory +- Include priority/status/date rules if applicable +- Do NOT include projectId logic — the runner handles project assignment automatically +- Do NOT mention isAiSuggested — the runner always sets it to 1 + +## Constraints +- Never ask about projects, projectId, or how to link records to projects +- Never include projectId or project creation logic in the generated config +- Keep asking questions until ≥ 90% confident, then output the JSON immediately {existing_section}\ -Keep asking clarifying questions until you are at least 90% confident you have -enough information to generate an accurate prompt_template. Once you reach that -confidence level, stop asking and produce the final template immediately. Begin by exploring the directory, then ask your first question.\ """ @@ -148,40 +165,53 @@ Begin by exploring the directory, then ask your first question.\ def _build_system_prompt( directory: str, data_types: list[str], - existing_template: str | None = None, + existing_config: str | None = None, ) -> tuple[str, Any]: """Return ``(compiled_system_prompt, langfuse_prompt_obj_or_None)``.""" existing_section = ( - f"\nThe user already has the following prompt_template — refine it based on their answers:\n" - f"---\n{existing_template}\n---\n" - if existing_template + "\nThe user already has the following AgentConfig — refine it based on their answers:\n" + f"```json\n{existing_config}\n```\n" + if existing_config else "" ) template, prompt_obj = get_prompt_or_fallback( - "journey_system", _JOURNEY_SYSTEM_PROMPT + "journey_system_v2", _JOURNEY_SYSTEM_PROMPT ) compiled = compile_prompt( template, prompt_obj, directory=directory, data_types=", ".join(data_types), - template_start=_TEMPLATE_START, - template_end=_TEMPLATE_END, + data_types_json=json.dumps(data_types), + config_start=_CONFIG_START, + config_end=_CONFIG_END, existing_section=existing_section, ) return compiled, prompt_obj -# ── Template extraction ─────────────────────────────────────────────────── +# ── AgentConfig extraction ──────────────────────────────────────────────── -def _extract_template(text: str) -> str | None: - """Return the text between PROMPT_TEMPLATE_START and PROMPT_TEMPLATE_END, or None.""" - if _TEMPLATE_START not in text or _TEMPLATE_END not in text: +def _extract_agent_config(text: str) -> str | None: + """Return validated AgentConfig JSON string from between markers, or None. + + Parses the JSON with Pydantic to ensure it conforms to the schema before + returning. Returns None if markers are absent or JSON is invalid. + """ + if _CONFIG_START not in text or _CONFIG_END not in text: + return None + start_idx = text.index(_CONFIG_START) + len(_CONFIG_START) + end_idx = text.index(_CONFIG_END) + raw = text[start_idx:end_idx].strip() + if not raw: + return None + try: + parsed = AgentConfig.model_validate_json(raw) + return parsed.model_dump_json() + except Exception as exc: + logger.warning("agent_setup: failed to parse AgentConfig JSON: %s", exc) return None - start_idx = text.index(_TEMPLATE_START) + len(_TEMPLATE_START) - end_idx = text.index(_TEMPLATE_END) - return text[start_idx:end_idx].strip() or None # ── LLM call with tool support ─────────────────────────────────────────── @@ -235,8 +265,7 @@ async def _call_llm_with_tools( lf.start_as_current_observation( as_type="span", name="journey-setup", - user_id=user_id or None, - session_id=session_id or None, + metadata={"user_id": user_id or None, "session_id": session_id or None}, input=history[-1]["content"] if history else "", ) if lf else None @@ -318,12 +347,12 @@ async def handle_journey_start( agent_type = frame.get("agent_type", "local") directory = frame.get("directory", "") data_types = frame.get("data_types", []) - existing_template = frame.get("existing_template") + existing_config = frame.get("existing_config") # Use the session_id provided by the FE so the reply matches the # listener key; fall back to a generated one if absent. session_id = frame.get("session_id") or str(uuid.uuid4()) - system_prompt, langfuse_prompt = _build_system_prompt(directory, data_types, existing_template) + system_prompt, langfuse_prompt = _build_system_prompt(directory, data_types, existing_config) session = JourneySession( session_id=session_id, @@ -335,10 +364,8 @@ async def handle_journey_start( langfuse_prompt=langfuse_prompt, ) - # The LLM will explore the directory using FILESYSTEM_TOOLS via the - # ws_context executor (already set by the WS handler before calling us). - # Seed with an initial user message — some providers (e.g. GitHub Copilot) - # require at least one user/input message to be present. + # Seed with an initial user message — some providers require at least one + # user/input message to be present. seed_history: list[dict[str, Any]] = [ {"role": "user", "content": "Hi, I'm ready to set up my agent. Please explore my directory and ask me your first question."}, ] @@ -362,14 +389,14 @@ async def handle_journey_start( directory, ) - # Check if the LLM produced the template on the first turn (unlikely but possible). - prompt_template = _extract_template(ai_reply) - done = prompt_template is not None + # Check if the LLM produced the config on the first turn (unlikely but possible). + agent_config = _extract_agent_config(ai_reply) + done = agent_config is not None display_message = ai_reply if done: display_message = ( - ai_reply[: ai_reply.index(_TEMPLATE_START)].strip() + ai_reply[: ai_reply.index(_CONFIG_START)].strip() or "Here is your agent configuration. You can save it or continue refining." ) _sessions.pop(session_id, None) @@ -379,7 +406,7 @@ async def handle_journey_start( "session_id": session_id, "message": display_message, "done": done, - "prompt_template": prompt_template, + "agent_config": agent_config, } @@ -402,7 +429,7 @@ async def handle_journey_message( "session_id": session_id, "message": "Journey session not found or expired. Please start a new setup.", "done": True, - "prompt_template": None, + "agent_config": None, } # Append user turn. @@ -420,18 +447,17 @@ async def handle_journey_message( session.history.append({"role": "assistant", "content": ai_reply}) - # Check if the LLM produced the final template. - prompt_template = _extract_template(ai_reply) - done = prompt_template is not None + # Check if the LLM produced the final config. + agent_config = _extract_agent_config(ai_reply) + done = agent_config is not None - # If the LLM didn't produce a template, nudge it once it has asked enough - # questions (>= _MIN_TURNS_BEFORE_NUDGE) or hits the hard safety cap. + # If the LLM didn't produce a config, nudge it once it hits the hard safety cap. if not done: turns = sum(1 for t in session.history if t["role"] == "user") if turns >= _MAX_TURNS: nudge_content = ( "[System: You have enough information. Please generate the final " - f"prompt_template now, wrapped in {_TEMPLATE_START} / {_TEMPLATE_END} markers.]" + f"AgentConfig JSON now, wrapped in {_CONFIG_START} / {_CONFIG_END} markers.]" ) session.history.append({"role": "user", "content": nudge_content}) @@ -445,16 +471,16 @@ async def handle_journey_message( ) session.history.append({"role": "assistant", "content": nudge_reply}) - prompt_template = _extract_template(nudge_reply) - if prompt_template is not None: + agent_config = _extract_agent_config(nudge_reply) + if agent_config is not None: done = True ai_reply = nudge_reply display_message = ai_reply if done: display_message = ( - ai_reply[: ai_reply.index(_TEMPLATE_START)].strip() - if _TEMPLATE_START in ai_reply + ai_reply[: ai_reply.index(_CONFIG_START)].strip() + if _CONFIG_START in ai_reply else "Here is your agent configuration. You can save it or continue refining." ) _sessions.pop(session_id, None) @@ -465,5 +491,5 @@ async def handle_journey_message( "session_id": session_id, "message": display_message, "done": done, - "prompt_template": prompt_template, + "agent_config": agent_config, } diff --git a/tests/conftest.py b/tests/conftest.py index 52a4e7e..fdef3ad 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,26 +6,21 @@ a per-test session, and a FastAPI ``TestClient`` wired to use it. from __future__ import annotations -import json -import os import time import uuid from collections.abc import AsyncGenerator, Generator -from unittest.mock import patch -import boto3 import pytest import pytest_asyncio from fastapi.testclient import TestClient from jose import jwt -from moto import mock_aws from sqlalchemy import StaticPool, event from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from app.config.settings import settings from app.db import Base, get_session from app.main import app -from app.models import Plugin, Subscription, User +from app.models import Subscription, User # ── Fixed test user IDs (one per tier) ─────────────────────────────── @@ -109,79 +104,6 @@ def client(db_session: AsyncSession) -> Generator[TestClient, None, None]: # n app.dependency_overrides.pop(get_session, None) -# ── Seed data helpers ──────────────────────────────────────────────── - -_SEED_PLUGINS = [ - Plugin( - id="plugin-github-sync", - name="GitHub Sync", - description="Sync tasks with GitHub Issues and pull requests.", - version="1.0.0", - author_name="Adiuva", - category="productivity", - price_cents=0, - permissions=json.dumps(["read:tasks", "write:tasks"]), - status="approved", - s3_package_key="plugins/plugin-github-sync/1.0.0/package.zip", - install_count=0, - avg_rating=0.0, - ), - Plugin( - id="plugin-slack-notify", - name="Slack Notifier", - description="Post task and timeline updates to Slack channels.", - version="1.2.0", - author_name="Adiuva", - category="communication", - price_cents=499, - permissions=json.dumps(["read:tasks", "read:timelines"]), - status="approved", - s3_package_key="plugins/plugin-slack-notify/1.2.0/package.zip", - install_count=0, - avg_rating=0.0, - ), - Plugin( - id="plugin-time-tracker", - name="Time Tracker", - description="Track time spent on tasks with automatic reporting.", - version="0.9.1", - author_name="Third Party", - category="productivity", - price_cents=999, - permissions=json.dumps(["read:tasks", "write:tasks"]), - status="approved", - s3_package_key="plugins/plugin-time-tracker/0.9.1/package.zip", - install_count=0, - avg_rating=0.0, - ), -] - - -@pytest_asyncio.fixture -async def seed_plugins(db_session: AsyncSession) -> list[Plugin]: - """Insert the 3 default approved plugins and return them.""" - plugins = [] - for template in _SEED_PLUGINS: - p = Plugin( - id=template.id, - name=template.name, - description=template.description, - version=template.version, - author_name=template.author_name, - category=template.category, - price_cents=template.price_cents, - permissions=template.permissions, - status=template.status, - s3_package_key=template.s3_package_key, - install_count=template.install_count, - avg_rating=template.avg_rating, - ) - db_session.add(p) - plugins.append(p) - await db_session.commit() - return plugins - - # ── JWT helpers ────────────────────────────────────────────────────── @@ -212,29 +134,6 @@ def auth_header(tier: str = "power", user_id: str | None = None) -> dict[str, st return {"Authorization": f"Bearer {make_jwt(tier, user_id)}"} -# ── S3 mock fixture ────────────────────────────────────────────────── - -S3_TEST_BUCKET = "test-bucket" -S3_TEST_REGION = "us-east-1" - - -@pytest.fixture -def s3_bucket(): - """Create a mocked S3 bucket via moto and patch BlobStore settings.""" - with mock_aws(): - os.environ.setdefault("AWS_ACCESS_KEY_ID", "testing") - os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "testing") - os.environ.setdefault("AWS_DEFAULT_REGION", S3_TEST_REGION) - client = boto3.client("s3", region_name=S3_TEST_REGION) - client.create_bucket(Bucket=S3_TEST_BUCKET) - with patch("app.storage.blob_store.settings") as mock_settings: - mock_settings.S3_BUCKET = S3_TEST_BUCKET - mock_settings.S3_REGION = S3_TEST_REGION - mock_settings.AWS_ACCESS_KEY_ID = "testing" - mock_settings.AWS_SECRET_ACCESS_KEY = "testing" - yield S3_TEST_BUCKET - - # ── CLI options ─────────────────────────────────────────────────────── def pytest_addoption(parser): @@ -243,3 +142,13 @@ def pytest_addoption(parser): default=None, help="Override fixture folder for preprocessor tests (must contain cases.yaml + data/)", ) + parser.addoption( + "--runner-dir", + default=None, + help="Override fixture folder for agent_runner_v2 eval tests (must contain cases.yaml + data/)", + ) + parser.addoption( + "--journey-dir", + default=None, + help="Override fixture folder for journey_v2 eval tests (must contain cases.yaml + data/)", + ) diff --git a/tests/fixtures/journey_v2/cases.yaml b/tests/fixtures/journey_v2/cases.yaml new file mode 100644 index 0000000..32ac4b4 --- /dev/null +++ b/tests/fixtures/journey_v2/cases.yaml @@ -0,0 +1,87 @@ +# Journey V2 eval test cases — Step 4 +# +# Each case simulates a complete journey session: +# 1. handle_journey_start is called with directory + data_types +# 2. handle_journey_message is called for each entry in user_messages +# 3. Assertions are evaluated on the final reply +# +# directory_files: list of {path, content_file} — content_file is relative to data/ +# +# Assertion keys: +# expect_question: true → first reply must contain "?" +# expect_done: true → final reply must have done=True +# expect_valid_config: true → agent_config must be parseable as AgentConfig with content_types > 0 +# expect_content_type_id: → AgentConfig.content_types must contain an entry with this id +# expect_extraction_contains: → first content_type extraction_prompt must contain this word +# expect_global_rules: true → AgentConfig.global_rules must be non-empty + +- id: "4.1" + description: "Journey start explores directory, first reply contains a question" + directory: "/test/emails" + data_types: ["tasks", "notes", "timelines"] + directory_files: + - path: "/test/emails/outlook_export_2024.html" + content_file: "email_action.html" + user_messages: [] + score_name: "journey.start" + expect_question: true + +- id: "4.2" + description: "Full 3-turn conversation produces a valid AgentConfig JSON" + directory: "/test/emails" + data_types: ["tasks", "notes", "timelines"] + directory_files: + - path: "/test/emails/email_backup.html" + content_file: "email_action.html" + user_messages: + - "These are email exports from Outlook in HTML format" + - "Create tasks for emails with direct action requests, notes for informational emails" + - "Yes, that looks correct. No other rules." + score_name: "journey.valid_json" + expect_done: true + expect_valid_config: true + +- id: "4.3" + description: "Journey detects email_html content type from directory exploration" + directory: "/test/emails" + data_types: ["tasks", "notes"] + directory_files: + - path: "/test/emails/message.html" + content_file: "email_action.html" + user_messages: + - "HTML email backups from my mail client, exported from Outlook" + - "Create tasks from emails that contain assignments or direct action items" + - "Correct, no other rules needed" + score_name: "journey.detect_email" + expect_done: true + expect_content_type_id: "email_html" + +- id: "4.4" + description: "Custom user rule (only notes, no tasks) reflected in extraction_prompt" + directory: "/test/emails" + data_types: ["notes"] + directory_files: + - path: "/test/emails/email.html" + content_file: "email_info.html" + user_messages: + - "HTML emails from my work inbox" + - "Create only notes from all emails — I do not want tasks or timelines to be created" + - "Yes, exactly" + score_name: "journey.custom_rules" + expect_done: true + expect_extraction_contains: "note" + +- id: "4.5" + description: "Global rule (no project = no entity) appears in AgentConfig.global_rules" + directory: "/test/emails" + data_types: ["tasks", "notes"] + directory_files: + - path: "/test/emails/email.html" + content_file: "email_action.html" + user_messages: + - "Email backups from Outlook" + - "Create tasks from action request emails, notes from informational emails" + - "If the email cannot be matched to any project, do not create any entity at all" + score_name: "journey.global_rules" + expect_done: true + expect_global_rules: true diff --git a/tests/fixtures/journey_v2/data/email_action.html b/tests/fixtures/journey_v2/data/email_action.html new file mode 100644 index 0000000..2ba1437 --- /dev/null +++ b/tests/fixtures/journey_v2/data/email_action.html @@ -0,0 +1,23 @@ + + + + + Email: Fix the login bug + + + +
+

From: boss@company.com

+

To: dev@company.com

+

Subject: Fix the login bug

+

Date: Mon, 7 Apr 2026 09:15:00 +0000

+
+
+

Hi,

+

Please fix the login bug in Project Alpha as soon as possible. + Users are reporting that they can't log in with their Google accounts. + This is blocking the whole team. Please resolve it by Friday.

+

Thanks,
Boss

+
+ + diff --git a/tests/fixtures/journey_v2/data/email_info.html b/tests/fixtures/journey_v2/data/email_info.html new file mode 100644 index 0000000..a84aa3c --- /dev/null +++ b/tests/fixtures/journey_v2/data/email_info.html @@ -0,0 +1,23 @@ + + + + + Email: New policy update + + + +
+

From: hr@company.com

+

To: all@company.com

+

Subject: FYI: New remote work policy effective May 1

+

Date: Tue, 8 Apr 2026 10:00:00 +0000

+
+
+

Hi everyone,

+

Just a heads-up that starting May 1, 2026 the company will be moving to + a hybrid work model. You will be expected to come into the office at least + two days per week. More details will follow in the employee handbook.

+

Best,
HR Team

+
+ + diff --git a/tests/test_journey_v2.py b/tests/test_journey_v2.py new file mode 100644 index 0000000..3cce9af --- /dev/null +++ b/tests/test_journey_v2.py @@ -0,0 +1,349 @@ +"""Tests for Local Agent V2 journey setup (Step 4). + +Covers the chatbot journey that produces a structured AgentConfig JSON +instead of a freeform prompt_template string. + +Unit tests (no LLM) +-------------------- + 4.6a _extract_agent_config: valid JSON → returns serialised config + 4.6b _extract_agent_config: invalid JSON → returns None + 4.6c _extract_agent_config: markers absent → returns None + 4.6d _extract_agent_config: only START marker → returns None + 4.6e Session not found → done=True, agent_config=None + 4.6f Nudge uses AGENT_CONFIG_START/END markers (not old PROMPT_TEMPLATE) + +Eval tests (real LLM + Langfuse scoring) +----------------------------------------- +Cases are defined in tests/fixtures/journey_v2/cases.yaml. +Email HTML files live in tests/fixtures/journey_v2/data/. +Use --journey-dir to point at a custom folder (same structure required). + +Run: + pytest tests/test_journey_v2.py -v + pytest tests/test_journey_v2.py -v -k "4_6" # unit only + pytest tests/test_journey_v2.py -v -k "eval" # LLM evals only + pytest tests/test_journey_v2.py -v --journey-dir /p # custom fixtures +""" + +from __future__ import annotations + +import uuid +from contextlib import nullcontext +from pathlib import Path +from typing import Any +from unittest.mock import patch + +import pytest +import yaml + +from app.api.routes.agent_setup import ( + _CONFIG_END, + _CONFIG_START, + _MAX_TURNS, + _extract_agent_config, + _sessions, + handle_journey_message, + handle_journey_start, +) +from app.core.langfuse_client import get_langfuse +from app.core.ws_context import clear_client_executor, set_client_executor +from app.schemas import AgentConfig +from tests.conftest import TEST_USER_IDS + +# ── Constants ───────────────────────────────────────────────────────────── + +_USER_ID = TEST_USER_IDS["power"] + +_DEFAULT_FIXTURE_DIR = Path(__file__).parent / "fixtures" / "journey_v2" + +# ── Fixture loading ─────────────────────────────────────────────────────── + + +def _fixtures_dir(config) -> Path: + override = config.getoption("--journey-dir") + return Path(override) if override else _DEFAULT_FIXTURE_DIR + + +def _load_cases(config) -> list[dict]: + return yaml.safe_load( + (_fixtures_dir(config) / "cases.yaml").read_text(encoding="utf-8") + ) + + +def _read_data_file(filename: str, fixtures_dir: Path) -> str: + return (fixtures_dir / "data" / filename).read_text(encoding="utf-8") + + +# ── pytest_generate_tests ───────────────────────────────────────────────── + + +def pytest_generate_tests(metafunc): + if "journey_case" not in metafunc.fixturenames: + return + cases = _load_cases(metafunc.config) + metafunc.parametrize("journey_case", cases, ids=[c["id"] for c in cases]) + + +# ── Executor builder ────────────────────────────────────────────────────── + + +def _make_fs_executor(directory_files: list[dict], fixtures_dir: Path): + """Return an async callback that simulates filesystem tool responses. + + Matches the signature expected by ``set_client_executor`` / ``execute_on_client``: + receives the full ``payload`` dict and returns a result dict. + + ``directory_files`` is a list of ``{path, content_file}`` dicts; + ``content_file`` is relative to ``fixtures_dir/data/``. + """ + file_map: dict[str, str] = { + entry["path"]: _read_data_file(entry["content_file"], fixtures_dir) + for entry in directory_files + } + + async def _executor(payload: dict) -> dict: + action = payload.get("action", "") + data = payload.get("data") or {} + + if action == "list_directory": + return {"entries": [ + {"type": "file", "name": p.split("/")[-1], "path": p} + for p in file_map + ]} + + if action == "read_file_content": + path = data.get("path", "") + return {"content": file_map.get(path, "")} + + if action == "get_file_metadata": + path = data.get("path", "") + name = path.split("/")[-1] + ext = "." + name.rsplit(".", 1)[-1] if "." in name else "" + return {"name": name, "extension": ext, "size": 1024, + "createdAt": None, "modifiedAt": None} + + return {} + + return _executor + + +# ── Journey runner helper ───────────────────────────────────────────────── + + +async def _run_journey(user_id: str, case: dict, executor) -> dict[str, Any]: + """Drive start + all user_messages for a case. Returns the final reply dict. + + Mirrors ``device_ws._handle_journey_start/message``: sets the client + executor (so filesystem tools work) before each handler call. + """ + session_id = str(uuid.uuid4()) + try: + set_client_executor(executor) + reply = await handle_journey_start(user_id, { + "agent_type": "local", + "directory": case["directory"], + "data_types": case["data_types"], + "session_id": session_id, + }) + + for msg in case.get("user_messages", []): + if reply.get("done"): + break + set_client_executor(executor) + reply = await handle_journey_message(user_id, { + "session_id": reply["session_id"], + "message": msg, + }) + finally: + clear_client_executor() + _sessions.pop(session_id, None) + + return reply + + +# ── Assertion helper ────────────────────────────────────────────────────── + + +def _evaluate_case(case: dict, reply: dict) -> tuple[float, str]: + """Return (score, comment) for a journey case given the final reply dict.""" + if case.get("expect_question"): + has_q = "?" in reply.get("message", "") + return (1.0 if has_q else 0.0), f"first_reply_has_question={has_q}" + + if case.get("expect_done") and not reply.get("done"): + return 0.0, "expected done=True but journey did not complete" + + agent_config_raw = reply.get("agent_config") + + if case.get("expect_valid_config"): + if not agent_config_raw: + return 0.0, "agent_config is None" + try: + parsed = AgentConfig.model_validate_json(agent_config_raw) + valid = len(parsed.content_types) > 0 + return (1.0 if valid else 0.0), f"content_types={len(parsed.content_types)}" + except Exception as exc: + return 0.0, f"parse error: {exc}" + + if case.get("expect_content_type_id"): + expected_id = case["expect_content_type_id"] + if not agent_config_raw: + return 0.0, "agent_config is None" + try: + parsed = AgentConfig.model_validate_json(agent_config_raw) + ids = [ct.id for ct in parsed.content_types] + found = expected_id in ids + return (1.0 if found else 0.0), f"content_type_ids={ids}, expected={expected_id}" + except Exception as exc: + return 0.0, f"parse error: {exc}" + + if case.get("expect_extraction_contains"): + keyword = case["expect_extraction_contains"].lower() + if not agent_config_raw: + return 0.0, "agent_config is None" + try: + parsed = AgentConfig.model_validate_json(agent_config_raw) + if not parsed.content_types: + return 0.0, "no content_types in config" + prompt = parsed.content_types[0].extraction_prompt.lower() + found = keyword in prompt + return (1.0 if found else 0.0), f"keyword='{keyword}' in extraction_prompt={found}" + except Exception as exc: + return 0.0, f"parse error: {exc}" + + if case.get("expect_global_rules"): + if not agent_config_raw: + return 0.0, "agent_config is None" + try: + parsed = AgentConfig.model_validate_json(agent_config_raw) + has_rules = len(parsed.global_rules) > 0 + return (1.0 if has_rules else 0.0), f"global_rules={parsed.global_rules}" + except Exception as exc: + return 0.0, f"parse error: {exc}" + + return 1.0, "no specific assertion" + + +# ── Unit tests ──────────────────────────────────────────────────────────── + + +def test_4_6a_extract_valid_json(): + """_extract_agent_config: valid JSON between markers → returns serialised config.""" + config = AgentConfig( + content_types=[], + global_rules=["No project = no entity"], + data_types=["tasks"], + ) + text = f"Some preamble\n{_CONFIG_START}\n{config.model_dump_json()}\n{_CONFIG_END}\nTrailing" + result = _extract_agent_config(text) + assert result is not None + parsed = AgentConfig.model_validate_json(result) + assert parsed.global_rules == ["No project = no entity"] + + +def test_4_6b_extract_invalid_json(): + """_extract_agent_config: malformed JSON between markers → returns None.""" + text = f"{_CONFIG_START}\n{{not: valid json\n{_CONFIG_END}" + assert _extract_agent_config(text) is None + + +def test_4_6c_extract_markers_absent(): + """_extract_agent_config: no markers at all → returns None.""" + assert _extract_agent_config("No markers here at all") is None + + +def test_4_6d_extract_only_start_marker(): + """_extract_agent_config: START without END → returns None.""" + assert _extract_agent_config(f"text {_CONFIG_START} no end marker") is None + + +@pytest.mark.asyncio +async def test_4_6e_session_not_found(): + """4.6e Session not found → done=True, agent_config=None, informative message.""" + reply = await handle_journey_message(_USER_ID, { + "session_id": "nonexistent-session-id", + "message": "Hello", + }) + assert reply["done"] is True + assert reply["agent_config"] is None + assert "not found" in reply["message"].lower() or "expired" in reply["message"].lower() + + +@pytest.mark.asyncio +async def test_4_6f_nudge_uses_new_markers(): + """4.6f Nudge injected after max turns uses AGENT_CONFIG markers, not PROMPT_TEMPLATE.""" + session_id = str(uuid.uuid4()) + captured_histories: list[list[dict]] = [] + + async def _mock_llm(system_prompt, history, tools, **kwargs) -> str: + captured_histories.append(list(history)) + # Return plain text — no markers — to trigger the nudge path. + return "I still need more information from you." + + from app.api.routes.agent_setup import JourneySession + + fake_session = JourneySession( + session_id=session_id, + user_id=_USER_ID, + agent_type="local", + directory="/test", + data_types=["tasks"], + system_prompt="system", + langfuse_prompt=None, + ) + # Fill history to the turn limit so the next message triggers the nudge. + for i in range(_MAX_TURNS): + fake_session.history.append({"role": "user", "content": f"msg {i}"}) + fake_session.history.append({"role": "assistant", "content": "ok"}) + _sessions[session_id] = fake_session + + try: + with patch("app.api.routes.agent_setup._call_llm_with_tools", side_effect=_mock_llm): + await handle_journey_message(_USER_ID, { + "session_id": session_id, + "message": "one more message to trigger nudge", + }) + finally: + _sessions.pop(session_id, None) + + # Second LLM call receives the nudge appended to history. + assert len(captured_histories) >= 2, "Expected ≥ 2 LLM calls (main reply + nudge)" + nudge_history = captured_histories[1] + user_msgs = " ".join(t["content"] for t in nudge_history if t["role"] == "user") + assert _CONFIG_START in user_msgs, f"Nudge must reference {_CONFIG_START}" + assert _CONFIG_END in user_msgs, f"Nudge must reference {_CONFIG_END}" + assert "PROMPT_TEMPLATE" not in user_msgs, "Old PROMPT_TEMPLATE markers must not appear in nudge" + + +# ── Eval tests (real LLM + Langfuse) ───────────────────────────────────── + + +@pytest.mark.asyncio +@pytest.mark.eval +async def test_eval_journey(journey_case, pytestconfig): + """Parametrized eval test — one invocation per YAML case.""" + case: dict = journey_case + fixtures_dir = _fixtures_dir(pytestconfig) + executor = _make_fs_executor(case.get("directory_files", []), fixtures_dir) + + lf = get_langfuse() + obs_ctx = lf.start_as_current_observation( + name=f"eval-journey-{case['id']}-{case.get('score_name', 'unknown').replace('.', '-')}", + metadata={"step": "4", "case_id": case["id"]}, + ) if lf else nullcontext() + + with obs_ctx as obs: + reply = await _run_journey(_USER_ID, case, executor) + score, comment = _evaluate_case(case, reply) + + if obs is not None: + obs.score( + name=case.get("score_name", f"journey.case_{case['id']}"), + value=score, + comment=comment, + ) + + if lf: + lf.flush() + + assert score == 1.0, f"[{case['id']}] {case.get('description', '')} — {comment}"