refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions
--- a/services/batch-agent/eval/cli.py
+++ b/services/batch-agent/eval/cli.py
@@ -4,14 +4,15 @@ Usage::

    # From services/batch-agent/:
    python -m eval run                                # all agent fixtures, default model
-    python -m eval run --fixture=freelance-invoices   # single fixture
-    python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4
-    python -m eval run --variants=baseline,detailed   # specific prompt variants
+    python -m eval run --fixture=classify-invoices    # single fixture
+    python -m eval run --models=gpt-4o,gpt-5.3-codex  # multiple models
+    python -m eval run --mode=step1                   # only step1 fixtures
    python -m eval run --no-judge                     # skip LLM judge scoring

-    python -m eval journey                            # all journey fixtures
-    python -m eval journey --fixture=journey-invoices # single journey fixture
-    python -m eval journey --models=gpt-4o,anthropic/claude-sonnet-4
+    python -m eval interactive                        # interactive journey session
+    python -m eval interactive --fixture=journey-invoice-setup
+    python -m eval interactive --model=gpt-4o
+    python -m eval interactive --judge-model=github_copilot/gpt-4o-mini

    python -m eval list                               # list all fixtures
    python -m eval sync                               # sync fixtures to Langfuse datasets
@@ -25,16 +26,24 @@ import logging
 import sys
 from pathlib import Path

-# Ensure the service root and repo root are in sys.path
+# Ensure the service root and repo root are in sys.path.
+# Service root must come BEFORE repo root so its ``app/`` package
+# shadows the monolith ``app/`` in the repo root.
 _SERVICE_ROOT = Path(__file__).resolve().parent.parent
 _REPO_ROOT = _SERVICE_ROOT.parent.parent
-for p in (_SERVICE_ROOT, _REPO_ROOT):
-    if str(p) not in sys.path:
-        sys.path.insert(0, str(p))
+_sr = str(_SERVICE_ROOT)
+_rr = str(_REPO_ROOT)
+if _rr not in sys.path:
+    sys.path.insert(0, _rr)
+# Always force service root to position 0 (python -m may have already
+# added CWD further down the list, which loses to repo root).
+if _sr in sys.path:
+    sys.path.remove(_sr)
+sys.path.insert(0, _sr)

 from eval.config import discover_fixtures, discover_journey_fixtures
 from eval.runner import run_fixture_eval, print_results
-from eval.journey_runner import run_journey_fixture_eval, print_journey_results
+from eval.interactive import run_interactive
 from eval import langfuse_eval


@@ -65,13 +74,14 @@ def _parse_args() -> argparse.Namespace:
    )
    run_cmd.add_argument(
        "--models", "-m",
-        default="gpt-4o",
-        help="Comma-separated list of models to test (default: gpt-4o)",
+        default="github_copilot/gpt-5.3-codex",
+        help="Comma-separated list of models to test (default: github_copilot/gpt-5.3-codex)",
    )
    run_cmd.add_argument(
-        "--variants", "-p",
+        "--mode",
        default=None,
-        help="Comma-separated prompt variants to test (default: all in fixture)",
+        choices=["step1", "step2", "full"],
+        help="Only run fixtures with this mode (default: all)",
    )
    run_cmd.add_argument(
        "--no-judge",
@@ -80,8 +90,8 @@ def _parse_args() -> argparse.Namespace:
    )
    run_cmd.add_argument(
        "--judge-model",
-        default="gpt-4o-mini",
-        help="Model for LLM judge (default: gpt-4o-mini)",
+        default="gpt-4o",
+        help="Model for LLM judge (default: gpt-4o)",
    )
    run_cmd.add_argument(
        "--fixtures-dir",
@@ -95,35 +105,40 @@ def _parse_args() -> argparse.Namespace:
    list_cmd.add_argument("--fixtures-dir", default=None)
    list_cmd.add_argument("-v", "--verbose", action="store_true")

-    # ── journey ───────────────────────────────────────────────────
-    journey_cmd = sub.add_parser("journey", help="Run journey evaluations")
-    journey_cmd.add_argument(
-        "--fixture", "-f",
-        help="Run only the named journey fixture (default: all)",
-    )
-    journey_cmd.add_argument(
-        "--models", "-m",
-        default="gpt-4o",
-        help="Comma-separated list of models to test (default: gpt-4o)",
-    )
-    journey_cmd.add_argument(
-        "--judge-model",
-        default="gpt-4o-mini",
-        help="Model for LLM judge (default: gpt-4o-mini)",
-    )
-    journey_cmd.add_argument(
-        "--fixtures-dir",
-        default=None,
-        help="Path to fixtures directory (default: eval/fixtures/)",
-    )
-    journey_cmd.add_argument("-v", "--verbose", action="store_true")
-
    # ── sync ──────────────────────────────────────────────────────
    sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets")
    sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
    sync_cmd.add_argument("--fixtures-dir", default=None)
    sync_cmd.add_argument("-v", "--verbose", action="store_true")

+    # ── interactive ───────────────────────────────────────────────
+    inter_cmd = sub.add_parser("interactive", help="Interactive journey session (human-in-the-loop)")
+    inter_cmd.add_argument(
+        "--fixture", "-f",
+        help="Journey fixture to use (default: pick interactively)",
+    )
+    inter_cmd.add_argument(
+        "--model", "-m",
+        default="github_copilot/gpt-5.3-codex",
+        help="Model for the journey AI (default: github_copilot/gpt-5.3-codex)",
+    )
+    inter_cmd.add_argument(
+        "--judge-model",
+        default="gpt-4o",
+        help="Model for LLM judge (default: gpt-4o)",
+    )
+    inter_cmd.add_argument(
+        "--fixtures-dir",
+        default=None,
+        help="Path to fixtures directory (default: eval/fixtures/)",
+    )
+    inter_cmd.add_argument(
+        "--data-dir",
+        default=None,
+        help="Override sample data directory (e.g. path to private test files not in git)",
+    )
+    inter_cmd.add_argument("-v", "--verbose", action="store_true")
+
    return parser.parse_args()


@@ -146,14 +161,14 @@ async def _cmd_run(args: argparse.Namespace) -> None:
            return

    models = [m.strip() for m in args.models.split(",")]
-    variants = [v.strip() for v in args.variants.split(",")] if args.variants else None

    all_results = []
    for fixture in fixtures:
+        if args.mode and fixture.mode != args.mode:
+            continue
        results = await run_fixture_eval(
            fixture,
            models=models,
-            variants=variants,
            use_llm_judge=not args.no_judge,
            judge_model=args.judge_model,
        )
@@ -172,12 +187,12 @@ def _cmd_list(args: argparse.Namespace) -> None:

    if fixtures:
        print(f"\n{'[Agent Fixtures]'}")
-        print(f"{'Name':<30} {'Types':<25} {'Variants':<20} {'Expected'}")
+        print(f"{'Name':<30} {'Mode':<6} {'Types':<25} {'Expected'}")
        print("-" * 90)
        for f in fixtures:
-            variants = ", ".join(f.prompt_variants.keys())
            types = ", ".join(f.data_types)
-            print(f"{f.name:<30} {types:<25} {variants:<20} {len(f.expected)}")
+            n_expected = len(f.expected) + len(f.expected_classification)
+            print(f"{f.name:<30} {f.mode:<6} {types:<25} {n_expected}")

    if journey_fixtures:
        print(f"\n{'[Journey Fixtures]'}")
@@ -217,30 +232,39 @@ def _cmd_sync(args: argparse.Namespace) -> None:
            print(f"Skipped: {fixture.name} (Langfuse not configured)")


-async def _cmd_journey(args: argparse.Namespace) -> None:
+async def _cmd_interactive(args: argparse.Namespace) -> None:
    journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))
    if not journey_fixtures:
        print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.")
        return

    if args.fixture:
-        journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture]
-        if not journey_fixtures:
+        fixtures = [f for f in journey_fixtures if f.name == args.fixture]
+        if not fixtures:
            print(f"Journey fixture '{args.fixture}' not found.")
            return
+        fixture = fixtures[0]
+    elif len(journey_fixtures) == 1:
+        fixture = journey_fixtures[0]
+    else:
+        # Let user pick
+        print("\nAvailable journey fixtures:")
+        for i, f in enumerate(journey_fixtures, 1):
+            print(f"  {i}. {f.name} — {f.description[:60]}")
+        print()
+        try:
+            choice = int(input("Pick a fixture number: ").strip()) - 1
+            fixture = journey_fixtures[choice]
+        except (ValueError, IndexError, EOFError, KeyboardInterrupt):
+            print("Invalid choice.")
+            return

-    models = [m.strip() for m in args.models.split(",")]
-
-    all_results = []
-    for fixture in journey_fixtures:
-        results = await run_journey_fixture_eval(
-            fixture,
-            models=models,
-            judge_model=args.judge_model,
-        )
-        all_results.extend(results)
-
-    print_journey_results(all_results)
+    await run_interactive(
+        fixture,
+        model=args.model,
+        judge_model=args.judge_model,
+        data_dir=Path(args.data_dir).resolve() if args.data_dir else None,
+    )


 def main() -> None:
@@ -249,8 +273,8 @@ def main() -> None:

    if args.command == "run":
        asyncio.run(_cmd_run(args))
-    elif args.command == "journey":
-        asyncio.run(_cmd_journey(args))
+    elif args.command == "interactive":
+        asyncio.run(_cmd_interactive(args))
    elif args.command == "list":
        _cmd_list(args)
    elif args.command == "sync":
--- a/services/batch-agent/eval/config.py
+++ b/services/batch-agent/eval/config.py
@@ -1,70 +1,16 @@
 """Eval configuration — YAML fixture loader and dataclasses.

-A *fixture* is a YAML file that defines a complete test scenario:
+Fixtures come in two families:

-.. code-block:: yaml
+1. **Agent fixtures** — test the batch agent pipeline.
+   Three modes controlled by ``mode``:

-    name: freelance-invoices
-    description: Extract tasks and notes from invoice PDFs (text layer)
-    directory: sample_files/invoices      # relative to fixture dir
-    data_types: [tasks, notes]
-    file_extensions: [txt, md]
+   ``step1``  — classification prompt only.
+   ``step2``  — processing prompt only.
+   ``full``   — both steps in sequence.

-    # Preseeded records the agent "sees" as existing data
-    seed_records:
-      projects:
-        - id: proj-1
-          name: "Website Redesign"
-          status: active
-      tasks: []
-
-    # Prompt variations to test (at least one required)
-    prompt_variants:
-      baseline: |
-        Extract action items as tasks and meeting summaries as notes.
-        Set priority based on urgency keywords.
-      detailed: |
-        Extract action items as tasks. Map "URGENT" to high priority,
-        "ASAP" to medium. Summaries become notes with full content.
-
-    # Expected extractions — what the agent SHOULD produce
-    expected:
-      tasks:
-        - title: "Send revised invoice to client"
-          priority: high
-          status: todo
-        - title: "Update project timeline"
-          priority: medium
-      notes:
-        - title: "Meeting summary - March kickoff"
-
-    # Optional: models to test (overrides CLI --models)
-    models: []
-
-A *journey fixture* tests the prompt-template builder conversation:
-
-.. code-block:: yaml
-
-    type: journey
-    name: journey-invoices
-    description: Test journey builds a good template for invoices
-    directory: sample_files/invoices
-    data_types: [tasks, notes]
-
-    # Simulated user responses for multi-turn conversation
-    user_messages:
-      - "I want to extract action items and meeting summaries"
-      - "Yes, map URGENTE to high priority"
-      - "That looks good, generate the template"
-
-    # Criteria the generated prompt_template should satisfy
-    expected_template_criteria:
-      - "mentions tasks and notes as target entities"
-      - "includes priority mapping rules"
-      - "references isAiSuggested=1"
-      - "does not mention projectId"
-
-    models: []
+2. **Journey fixtures** — test the prompt-template builder conversation
+   (see ``JourneyFixture``).
 """

 from __future__ import annotations
@@ -72,12 +18,14 @@ from __future__ import annotations
 import logging
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal

 import yaml

 logger = logging.getLogger(__name__)

+EvalMode = Literal["step1", "step2", "full"]
+

@dataclass
 class ExpectedRecord:
@@ -90,21 +38,52 @@ class ExpectedRecord:
    fields: dict[str, Any]  # field_name → expected_value


+@dataclass
+class ExpectedClassification:
+    """Expected output of step-1 classification for one file."""
+
+    file: str  # relative path to the sample file
+    project_id: str  # expected matched project id, or "new"
+    domains: list[str]  # expected domain list
+    new_project_name: str | None = None
+
+
@dataclass
 class EvalFixture:
-    """A complete test scenario loaded from YAML."""
+    """A complete test scenario loaded from YAML.
+
+    ``mode`` determines which pipeline steps are exercised:
+
+    - **step1**: only ``_classify_file``
+    - **step2**: only the processing LLM + tool loop
+    - **full**: both steps in sequence (``run_local_agent``)
+    """

    name: str
    description: str
+    mode: EvalMode
    directory: str  # relative path to sample files
    data_types: list[str]
    file_extensions: list[str]
-    seed_records: dict[str, list[dict]]
-    prompt_variants: dict[str, str]  # variant_name → prompt_template
-    expected: list[ExpectedRecord]
    models: list[str]  # if empty, use CLI default
    fixture_path: Path = field(default_factory=lambda: Path("."))

+    # ── Step-1 inputs (classification) ───────────────────────────
+    domain_definitions: str = ""
+    projects_list: list[dict[str, Any]] = field(default_factory=list)
+
+    # ── Step-2 inputs (processing) ───────────────────────────────
+    existing_context: str = ""
+    project_context: str = ""
+    custom_prompt_section: str = ""
+
+    # ── Seed records for mock executor ───────────────────────────
+    seed_records: dict[str, list[dict]] = field(default_factory=dict)
+
+    # ── Expected outputs ─────────────────────────────────────────
+    expected_classification: list[ExpectedClassification] = field(default_factory=list)
+    expected: list[ExpectedRecord] = field(default_factory=list)
+
    @property
    def fixture_dir(self) -> Path:
        """Absolute path to the sample files directory."""
@@ -115,22 +94,44 @@ class EvalFixture:
        """Load a fixture from a YAML file."""
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))

+        mode: EvalMode = raw.get("mode", "full")
+
+        # Parse expected records (step2/full)
        expected: list[ExpectedRecord] = []
        for table, records in (raw.get("expected") or {}).items():
            for rec in records:
                expected.append(ExpectedRecord(table=table, fields=rec))

+        # Parse expected classification (step1/full)
+        expected_classification: list[ExpectedClassification] = []
+        for item in raw.get("expected_classification") or []:
+            expected_classification.append(ExpectedClassification(
+                file=item["file"],
+                project_id=item["project_id"],
+                domains=item.get("domains", []),
+                new_project_name=item.get("new_project_name"),
+            ))
+
        return cls(
            name=raw["name"],
            description=raw.get("description", ""),
+            mode=mode,
            directory=raw.get("directory", "sample_files"),
            data_types=raw.get("data_types", ["tasks"]),
            file_extensions=raw.get("file_extensions", []),
-            seed_records=raw.get("seed_records", {}),
-            prompt_variants=raw.get("prompt_variants", {"default": ""}),
-            expected=expected,
            models=raw.get("models", []),
            fixture_path=path,
+            # Step-1 inputs
+            domain_definitions=raw.get("domain_definitions", ""),
+            projects_list=raw.get("projects_list", []),
+            # Step-2 inputs
+            existing_context=raw.get("existing_context", ""),
+            project_context=raw.get("project_context", ""),
+            custom_prompt_section=raw.get("custom_prompt_section", ""),
+            # Shared
+            seed_records=raw.get("seed_records", {}),
+            expected_classification=expected_classification,
+            expected=expected,
        )


@@ -168,9 +169,9 @@ class JourneyFixture:
    description: str
    directory: str  # relative path to sample files
    data_types: list[str]
-    user_messages: list[str]  # simulated user responses
    expected_template_criteria: list[str]  # what the template should contain/satisfy
-    models: list[str]
+    user_messages: list[str] = field(default_factory=list)  # for automated journey runs (unused in interactive mode)
+    models: list[str] = field(default_factory=list)
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property
--- a/services/batch-agent/eval/fixtures/classify_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/classify_invoices.yaml
@@ -0,0 +1,40 @@
+# Fixture: classify-invoices (step1)
+# Tests _STEP1_SYSTEM_PROMPT — file classification and project matching.
+# Verifies that the LLM correctly matches files to existing projects
+# and identifies the right data domains.
+
+name: classify-invoices
+mode: step1
+description: >
+  Test file classification on Italian freelance invoices and meeting notes.
+  Verifies project matching and domain identification.
+
+directory: sample_files/invoices
+data_types: [tasks, notes, timelines]
+file_extensions: [txt, md]
+
+# ── Step-1 prompt variables ──────────────────────────────────────
+domain_definitions: |
+  - tasks: Action items, deliverables, things to do — anything that someone needs to complete.
+  - notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
+  - timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
+
+projects_list:
+  - id: "proj-web-redesign"
+    name: "Redesign Sito Web Corporate"
+    status: "active"
+    aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
+  - id: "proj-ecommerce"
+    name: "E-Commerce FashionStore"
+    status: "active"
+    aiSummary: "Next.js e-commerce platform for FashionStore srl"
+
+# ── Expected classification results ─────────────────────────────
+expected_classification:
+  - file: "sample_files/invoices/fattura_042.txt"
+    project_id: "proj-web-redesign"
+    domains: [tasks, notes, timelines]
+
+  - file: "sample_files/invoices/meeting_ecommerce.md"
+    project_id: "proj-ecommerce"
+    domains: [tasks, notes, timelines]
--- a/services/batch-agent/eval/fixtures/freelance_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/freelance_invoices.yaml
@@ -1,86 +0,0 @@
-# Fixture: freelance-invoices
-# Tests extraction of tasks, notes, and timelines from
-# invoices and meeting notes typical of a freelance workflow.
-
-name: freelance-invoices
-description: >
-  Extract tasks, notes, and timeline events from Italian freelance
-  invoices and meeting notes. Tests project matching, priority
-  mapping, and bilingual content handling.
-
-directory: sample_files/invoices
-data_types: [tasks, notes, timelines]
-file_extensions: [txt, md]
-
-# Pre-existing records in the "database"
-seed_records:
-  projects:
-    - id: "proj-web-redesign"
-      name: "Redesign Sito Web Corporate"
-      status: "active"
-      aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
-    - id: "proj-ecommerce"
-      name: "E-Commerce FashionStore"
-      status: "active"
-      aiSummary: "Next.js e-commerce platform for FashionStore srl"
-  tasks: []
-  notes: []
-  timelines: []
-
-# Prompt variations to compare
-prompt_variants:
-  baseline: |
-    Extract action items as tasks and summaries as notes.
-    For timelines, extract any mentioned dates and deadlines.
-    Set isAiSuggested=1 on every record.
-
-  detailed_italian: |
-    Estrai i dati dai file come segue:
-    - TASK: ogni azione da fare, deliverable, o item con scadenza.
-      Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
-      Mappa "media priorità" → priority: medium.
-      Mappa "bassa priorità" → priority: low.
-      Se un item è marcato come "completato" o [x], impostalo status: done.
-      Altrimenti status: todo.
-    - NOTE: riassunti di meeting, decisioni prese, note tecniche.
-      Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli.
-    - TIMELINE: date di scadenza, milestone, meeting futuri.
-      Formato data: timestamp Unix in millisecondi.
-    Imposta sempre isAiSuggested=1.
-
-  minimal: |
-    Extract only high-priority action items as tasks.
-    Ignore notes and timelines unless explicitly marked as important.
-    Set isAiSuggested=1.
-
-# Expected extractions (what the agent SHOULD produce)
-# Only key fields are specified — scorer uses fuzzy matching
-expected:
-  tasks:
-    - title: "Sviluppo frontend React"
-      priority: "high"
-      status: "todo"
-    - title: "Integrazione API backend"
-      priority: "medium"
-      status: "todo"
-    - title: "Testing cross-browser e fix bug responsive"
-      status: "todo"
-    - title: "Preparare wireframe homepage"
-      priority: "high"
-      status: "todo"
-    - title: "Setup progetto Next.js e configurare CI/CD"
-      priority: "medium"
-      status: "todo"
-    - title: "Ricerca plugin Stripe per gestione abbonamenti"
-      priority: "low"
-      status: "todo"
-
-  notes:
-    - title: "Meeting Kickoff Progetto E-Commerce"
-
-  timelines:
-    - title: "MVP E-Commerce pronto"
-    - title: "Meeting di revisione"
-
-# Models to test (can be overridden via CLI --models)
-models: []
--- a/services/batch-agent/eval/fixtures/full_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/full_invoices.yaml
@@ -0,0 +1,108 @@
+# Fixture: full-invoices (full)
+# Tests both _STEP1_SYSTEM_PROMPT and _PROCESSING_SYSTEM_PROMPT in sequence
+# via run_local_agent(). Verifies end-to-end classification + extraction.
+
+name: full-invoices
+mode: full
+description: >
+  End-to-end test: classify Italian invoices/meeting notes into the
+  correct project, then extract tasks, notes, and timeline events.
+
+directory: sample_files/invoices
+data_types: [tasks, notes, timelines]
+file_extensions: [txt, md]
+
+# ── Step-1 prompt variables ──────────────────────────────────────
+domain_definitions: |
+  - tasks: Action items, deliverables, things to do — anything that someone needs to complete.
+  - notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
+  - timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
+
+projects_list:
+  - id: "proj-web-redesign"
+    name: "Redesign Sito Web Corporate"
+    status: "active"
+    aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
+  - id: "proj-ecommerce"
+    name: "E-Commerce FashionStore"
+    status: "active"
+    aiSummary: "Next.js e-commerce platform for FashionStore srl"
+
+# ── Step-2 prompt variables ──────────────────────────────────────
+existing_context: |
+  Existing tasks:
+    (none)
+
+  Existing notes:
+    (none)
+
+  Existing timelines:
+    (none)
+
+project_context: ""
+
+custom_prompt_section: |
+  User instructions:
+  Estrai i dati dai file come segue:
+  - TASK: ogni azione da fare, deliverable, o item con scadenza.
+    Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
+    Mappa "media priorità" → priority: medium.
+    Mappa "bassa priorità" → priority: low.
+    Se un item è marcato come "completato" o [x], impostalo status: done.
+    Altrimenti status: todo.
+  - NOTE: riassunti di meeting, decisioni prese, note tecniche.
+  - TIMELINE: date di scadenza, milestone, meeting futuri.
+  Imposta sempre isAiSuggested=1.
+
+# ── Seed records (pre-existing DB state) ─────────────────────────
+seed_records:
+  projects:
+    - id: "proj-web-redesign"
+      name: "Redesign Sito Web Corporate"
+      status: "active"
+      aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
+    - id: "proj-ecommerce"
+      name: "E-Commerce FashionStore"
+      status: "active"
+      aiSummary: "Next.js e-commerce platform for FashionStore srl"
+  tasks: []
+  notes: []
+  timelines: []
+
+# ── Expected classification (step 1) ─────────────────────────────
+expected_classification:
+  - file: "sample_files/invoices/fattura_042.txt"
+    project_id: "proj-web-redesign"
+    domains: [tasks, notes, timelines]
+
+  - file: "sample_files/invoices/meeting_ecommerce.md"
+    project_id: "proj-ecommerce"
+    domains: [tasks, notes, timelines]
+
+# ── Expected extractions (step 2) ────────────────────────────────
+expected:
+  tasks:
+    - title: "Sviluppo frontend React"
+      priority: "high"
+      status: "todo"
+    - title: "Integrazione API backend"
+      priority: "medium"
+      status: "todo"
+    - title: "Testing cross-browser e fix bug responsive"
+      status: "todo"
+    - title: "Preparare wireframe homepage"
+      priority: "high"
+      status: "todo"
+    - title: "Setup progetto Next.js e configurare CI/CD"
+      priority: "medium"
+      status: "todo"
+    - title: "Ricerca plugin Stripe per gestione abbonamenti"
+      priority: "low"
+      status: "todo"
+
+  notes:
+    - title: "Meeting Kickoff Progetto E-Commerce"
+
+  timelines:
+    - title: "MVP E-Commerce pronto"
+    - title: "Meeting di revisione"
--- a/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml
+++ b/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml
@@ -1,43 +1,25 @@
 # Journey Fixture: journey-invoice-setup
-# Tests that the journey chatbot correctly builds a prompt_template
-# for extracting tasks and notes from Italian invoices and meeting notes.
+# Used by `python -m eval interactive` for human-in-the-loop testing
+# of the journey chatbot's prompt-building conversation.

 type: journey
 name: journey-invoice-setup
 description: >
-  Test the journey chatbot's ability to explore a directory of Italian
-  invoices and meeting notes, ask relevant questions, and produce a
-  well-structured prompt_template for data extraction.
+  Interactive test for the journey chatbot — explore a directory of
+  Italian invoices and meeting notes, answer the chatbot's questions,
+  and verify it produces a well-structured prompt_template for data
+  extraction.

 directory: sample_files/invoices
-data_types: [tasks, notes, timelines]
-
-# Simulated user responses (the journey starts with the LLM exploring
-# the directory and asking its first question)
-user_messages:
-  - >
-    I want to extract action items from invoices and meeting notes.
-    The invoices are in Italian and contain work descriptions with
-    deadlines. Meeting notes have action items with checkboxes.
-  - >
-    Yes, map Italian priority keywords: "URGENTE" and "ALTA PRIORITÀ"
-    should be high priority, "media priorità" is medium, "bassa priorità"
-    is low. Items marked with [x] are already completed.
-  - >
-    For notes, I want meeting summaries with the full content including
-    decisions and attendees. For timelines, extract deadlines and
-    scheduled meeting dates.
-  - >
-    That's everything I need. Please generate the template.
+data_types: [tasks, notes, timelines, projects]

 # Criteria the generated prompt_template must satisfy
 # Each is scored 0-1 by an LLM judge
 expected_template_criteria:
  - "Mentions creating tasks from action items and work descriptions"
-  - "Includes Italian priority keyword mapping (URGENTE→high, media priorità→medium, bassa priorità→low)"
-  - "Handles completed items marked with [x] as status done"
  - "Mentions creating notes from meeting summaries"
  - "Mentions extracting timeline events from deadlines and meeting dates"
+  - "Mentions creating projects from relevant information"
  - "Sets isAiSuggested=1 on all created records"
  - "Does NOT include projectId assignment logic"
  - "Uses camelCase field names (title, status, priority, dueDate, content)"
--- a/services/batch-agent/eval/fixtures/process_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/process_invoices.yaml
@@ -0,0 +1,81 @@
+# Fixture: process-invoices (step2)
+# Tests _PROCESSING_SYSTEM_PROMPT — data extraction & tool calling.
+# The classification step is skipped; prompt variables are injected directly.
+
+name: process-invoices
+mode: step2
+description: >
+  Test data extraction from Italian freelance invoices.
+  Verifies correct record creation via tool calls with the right
+  fields, priorities, and status values.
+
+directory: sample_files/invoices
+data_types: [tasks, notes, timelines]
+file_extensions: [txt, md]
+
+# ── Step-2 prompt variables ──────────────────────────────────────
+existing_context: |
+  Existing tasks:
+    (none)
+
+  Existing notes:
+    (none)
+
+  Existing timelines:
+    (none)
+
+project_context: >
+  Project: Redesign Sito Web Corporate (id: proj-web-redesign).
+  Always set projectId to this id on every record you create.
+
+custom_prompt_section: |
+  User instructions:
+  Estrai i dati dai file come segue:
+  - TASK: ogni azione da fare, deliverable, o item con scadenza.
+    Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
+    Mappa "media priorità" → priority: medium.
+    Mappa "bassa priorità" → priority: low.
+    Se un item è marcato come "completato" o [x], impostalo status: done.
+    Altrimenti status: todo.
+  - NOTE: riassunti di meeting, decisioni prese, note tecniche.
+    Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli.
+  - TIMELINE: date di scadenza, milestone, meeting futuri.
+  Imposta sempre isAiSuggested=1.
+
+# ── Seed records (pre-existing DB state) ─────────────────────────
+seed_records:
+  projects:
+    - id: "proj-web-redesign"
+      name: "Redesign Sito Web Corporate"
+      status: "active"
+  tasks: []
+  notes: []
+  timelines: []
+
+# ── Expected extractions ─────────────────────────────────────────
+expected:
+  tasks:
+    - title: "Sviluppo frontend React"
+      priority: "high"
+      status: "todo"
+    - title: "Integrazione API backend"
+      priority: "medium"
+      status: "todo"
+    - title: "Testing cross-browser e fix bug responsive"
+      status: "todo"
+    - title: "Preparare wireframe homepage"
+      priority: "high"
+      status: "todo"
+    - title: "Setup progetto Next.js e configurare CI/CD"
+      priority: "medium"
+      status: "todo"
+    - title: "Ricerca plugin Stripe per gestione abbonamenti"
+      priority: "low"
+      status: "todo"
+
+  notes:
+    - title: "Meeting Kickoff Progetto E-Commerce"
+
+  timelines:
+    - title: "MVP E-Commerce pronto"
+    - title: "Meeting di revisione"
--- a/services/batch-agent/eval/interactive.py
+++ b/services/batch-agent/eval/interactive.py
@@ -0,0 +1,471 @@
+"""Interactive journey session — human-in-the-loop CLI conversation.
+
+Flow:
+1. Show the system prompt used by the journey AI.
+2. Start the journey (AI explores files, asks first question).
+3. User types responses in the terminal — AI replies.
+4. User types `/done` to end the conversation.
+5. User writes a comment about the interaction quality.
+6. LLM judge scores the conversation + generated template.
+7. Results are reported to Langfuse.
+
+Usage::
+
+    python -m eval interactive                        # pick a fixture interactively
+    python -m eval interactive --fixture=journey-invoice-setup
+    python -m eval interactive --model=gpt-4o
+    python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import sys
+import time
+import uuid
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from langchain_core.messages import HumanMessage, SystemMessage
+
+from eval import langfuse_eval
+from eval.config import JourneyFixture, discover_journey_fixtures
+from eval.mock_executor import MockExecutor
+
+logger = logging.getLogger(__name__)
+
+# ── Special commands ─────────────────────────────────────────────────────
+
+# Slash-commands recognised at the "YOU:" prompt.  run_interactive() strips
+# the input line and lower-cases it before comparing against these values.
+_CMD_DONE = "/done"
+_CMD_QUIT = "/quit"
+_CMD_TEMPLATE = "/template"
+_CMD_HELP = "/help"
+
+# Text printed in response to the /help command.
+_HELP_TEXT = f"""\
+  {_CMD_DONE}       — End the conversation and proceed to evaluation
+  {_CMD_QUIT}       — Abort without evaluation
+  {_CMD_TEMPLATE}   — Show the generated template (if any)
+  {_CMD_HELP}       — Show this help"""
+
+# ── Terminal colours (ANSI) ──────────────────────────────────────────────
+
+# ANSI escape sequences used by the print helpers below.  They are embedded
+# directly in the printed strings; this module performs no TTY check.
+_C_RESET = "\033[0m"  # reset all attributes
+_C_BOLD = "\033[1m"
+_C_DIM = "\033[2m"
+_C_CYAN = "\033[36m"
+_C_GREEN = "\033[32m"
+_C_YELLOW = "\033[33m"
+_C_MAGENTA = "\033[35m"
+_C_RED = "\033[31m"
+_C_BLUE = "\033[34m"
+
+
+def _print_header(text: str) -> None:
+    """Print ``text`` as a bold cyan banner framed by two 80-character rules."""
+    rule = "═" * 80
+    print(
+        f"\n{_C_BOLD}{_C_CYAN}{rule}",
+        f"  {text}",
+        f"{rule}{_C_RESET}\n",
+        sep="\n",
+    )
+
+
+def _print_ai(text: str) -> None:
+    """Print an AI reply behind a bold green ``AI:`` label, padded by blank lines."""
+    label = f"{_C_GREEN}{_C_BOLD}AI:{_C_RESET}"
+    print(f"\n{label} {text}\n")
+
+
+def _print_system(text: str) -> None:
+    """Print a harness status message in dim text."""
+    dimmed = f"{_C_DIM}{text}{_C_RESET}"
+    print(dimmed)
+
+
+def _print_score(label: str, score: float) -> None:
+    """Print one score line, colour-tagged PASS (>= 0.7), PARTIAL (>= 0.4) or FAIL."""
+    # Bands are checked from the highest threshold down; the first hit wins.
+    bands = ((0.7, _C_GREEN, "PASS"), (0.4, _C_YELLOW, "PARTIAL"))
+    for threshold, color, tag in bands:
+        if score >= threshold:
+            break
+    else:
+        # Below every threshold.
+        color, tag = _C_RED, "FAIL"
+    print(f"  {color}{tag:>7}{_C_RESET} ({score:.1f}) {label}")
+
+
+# ── Result type ──────────────────────────────────────────────────────────
+
+
+@dataclass
+class InteractiveResult:
+    """Outcome of one interactive journey session, including judge scores."""
+
+    fixture_name: str
+    model: str
+    judge_model: str
+    # Template produced by the journey, or None when none was generated.
+    prompt_template: str | None
+    # Transcript entries shaped as {"role": ..., "content": ...}.
+    conversation: list[dict[str, str]]
+    user_comment: str
+    # True when the journey itself reported completion.
+    done: bool
+    # Judge score per criterion.
+    criteria_scores: dict[str, float]
+    overall_score: float
+    judge_reasoning: str
+    elapsed_seconds: float
+
+    def summary(self) -> dict[str, Any]:
+        """Return a compact dict of the result with rounded numeric values.
+
+        ``turns`` counts only the transcript entries whose role is ``"user"``.
+        """
+        user_turns = sum(1 for entry in self.conversation if entry["role"] == "user")
+        rounded_criteria = {
+            name: round(value, 3) for name, value in self.criteria_scores.items()
+        }
+        return {
+            "fixture": self.fixture_name,
+            "model": self.model,
+            "judge_model": self.judge_model,
+            "done": self.done,
+            "turns": user_turns,
+            "overall_score": round(self.overall_score, 3),
+            "user_comment": self.user_comment,
+            "criteria_scores": rounded_criteria,
+            "elapsed_s": round(self.elapsed_seconds, 1),
+        }
+
+
+# ── LLM judge ────────────────────────────────────────────────────────────
+
+# System prompt for the interactive-session judge.  The JSON keys requested
+# here (criteria_scores, overall_quality, reasoning) are the ones that
+# _judge_interactive() reads back from the judge's reply.
+_INTERACTIVE_JUDGE_SYSTEM = """\
+You are an evaluation judge for AI-generated prompt templates produced during
+an interactive conversation between a human and a journey chatbot.
+
+The chatbot explored a directory and through multi-turn conversation with the
+user produced a prompt_template — an instruction set for a data-extraction agent.
+
+You have access to:
+- The full conversation transcript
+- The generated prompt_template (if any)
+- The user's own comment about the interaction
+- A list of quality criteria
+
+Score each criterion from 0 to 1:
+  - 1.0: Fully satisfied
+  - 0.5: Partially satisfied
+  - 0.0: Not satisfied
+
+Also provide an overall_quality score (0-1) evaluating the conversation flow,
+how well the AI understood the user, and the template quality.
+
+Respond with ONLY a JSON object:
+{
+  "criteria_scores": {"criterion_1": 0.8, ...},
+  "overall_quality": 0.85,
+  "reasoning": "Brief explanation covering both conversation quality and template accuracy"
+}
+"""
+
+
+async def _judge_interactive(
+    conversation: list[dict[str, str]],
+    prompt_template: str | None,
+    user_comment: str,
+    criteria: list[str],
+    *,
+    judge_model: str = "gpt-4o-mini",
+) -> tuple[dict[str, float], float, str]:
+    """Score an interactive session with an LLM judge.
+
+    Parameters
+    ----------
+    conversation :
+        Transcript entries with ``role`` and ``content`` keys; every role
+        other than ``"user"`` is rendered as ``AI`` in the judge input.
+    prompt_template :
+        Template generated by the journey, or ``None`` if none was produced.
+    user_comment :
+        The human evaluator's free-text comment.
+    criteria :
+        Quality criteria the judge must score.
+    judge_model :
+        Model name passed to ``get_llm``.
+
+    Returns
+    -------
+    tuple
+        ``(criteria_scores, overall_quality, reasoning)``.  ``criteria_scores``
+        is keyed by the criterion text.  If the judge call or the parsing of
+        its reply raises, every criterion scores ``0.0``, the overall quality
+        is ``0.0`` and the reasoning carries the error message.
+    """
+    from shared.llm import get_llm
+
+    llm = get_llm(model=judge_model, temperature=0)
+
+    conv_text = "\n".join(
+        f"{'USER' if t['role'] == 'user' else 'AI'}: {t['content']}"
+        for t in conversation
+    )
+    criteria_text = "\n".join(f"  {i+1}. {c}" for i, c in enumerate(criteria))
+
+    user_content = (
+        f"## Conversation transcript\n```\n{conv_text}\n```\n\n"
+        f"## Generated prompt_template\n```\n{prompt_template or '(none — conversation did not complete)'}\n```\n\n"
+        f"## User's comment\n{user_comment}\n\n"
+        f"## Criteria to evaluate\n{criteria_text}"
+    )
+
+    try:
+        response = await llm.ainvoke([
+            SystemMessage(content=_INTERACTIVE_JUDGE_SYSTEM),
+            HumanMessage(content=user_content),
+        ])
+        raw = response.content.strip()
+        # Tolerate a reply wrapped in a Markdown code fence (``` or ```json).
+        if raw.startswith("```"):
+            raw = raw.split("```")[1]
+            if raw.startswith("json"):
+                raw = raw[4:]
+        parsed = json.loads(raw.strip())
+
+        scores_raw = parsed.get("criteria_scores", parsed.get("scores", {}))
+        criteria_scores: dict[str, float] = {}
+        for i, criterion in enumerate(criteria):
+            # The judge may key a score by "criterion_N", by the criterion
+            # text (full or truncated to 50 chars), or by its 1-based index.
+            key_candidates = (f"criterion_{i+1}", criterion, criterion[:50], str(i + 1))
+            matched_key = next((k for k in key_candidates if k in scores_raw), None)
+            if matched_key is not None:
+                # A matched key is authoritative, even when its score is 0.0;
+                # it must not be replaced by the positional guess below.
+                score = float(scores_raw[matched_key])
+            elif i < len(scores_raw):
+                # No recognisable key: fall back to the i-th value in reply order.
+                score = float(list(scores_raw.values())[i])
+            else:
+                score = 0.0
+            criteria_scores[criterion] = score
+
+        overall = float(parsed.get("overall_quality", 0.0))
+        reasoning = str(parsed.get("reasoning", ""))
+        return criteria_scores, overall, reasoning
+
+    except Exception as exc:
+        # Best-effort scoring: a judge failure must not abort the session.
+        logger.warning("interactive judge failed: %s", exc)
+        return {c: 0.0 for c in criteria}, 0.0, f"Judge error: {exc}"
+
+
+# ── Interactive session ──────────────────────────────────────────────────
+
+
+async def run_interactive(
+    fixture: JourneyFixture,
+    *,
+    model: str = "gpt-4o",
+    judge_model: str = "gpt-4o-mini",
+    data_dir: Path | None = None,
+) -> InteractiveResult:
+    """Run an interactive journey session in the terminal.
+
+    The session prints the journey system prompt, starts the journey, relays
+    terminal input to the journey AI until it reports ``done`` or the user
+    types ``/done``, collects a free-text comment, scores the session with
+    the LLM judge and reports the outcome through ``langfuse_eval``.
+    ``/quit`` (or EOF / Ctrl-C at the prompt) aborts and returns immediately,
+    without judging or reporting.
+
+    Parameters
+    ----------
+    fixture :
+        Journey fixture supplying the directory, data types and judge criteria.
+    model :
+        Model name written to ``settings.LLM_MODEL`` for the duration of the
+        session; the previous value is restored afterwards.
+    judge_model :
+        Model used to score the session.
+    data_dir :
+        If set, overrides the fixture's sample-file directory.  The LLM
+        will explore this folder instead of the default
+        ``fixtures/sample_files/…``.  Useful for private test data that
+        shouldn't be committed to git.
+
+    Returns
+    -------
+    InteractiveResult
+        Transcript, template, user comment and judge scores.  After ``/quit``
+        the result has empty ``criteria_scores`` and ``done=False``.
+    """
+    from shared.config import settings
+    from shared.ws_context import set_current_user, clear_current_user
+    from app.journey import (
+        handle_journey_start,
+        handle_journey_message,
+        _build_system_prompt,
+    )
+
+    # When --data-dir is given, the MockExecutor's root becomes
+    # data_dir's parent and the journey directory is data_dir's name.
+    # This way the LLM sees a meaningful directory name (not ".") and
+    # MockExecutor resolves paths correctly.
+    # Otherwise, use the fixture's YAML parent and its relative path.
+    if data_dir:
+        mock_root = data_dir.parent
+        journey_directory = data_dir.name
+    else:
+        mock_root = fixture.fixture_path.parent
+        journey_directory = fixture.directory
+
+    mock = MockExecutor(
+        fixture_dir=mock_root,
+        seed_records={},
+    )
+
+    original_model = settings.LLM_MODEL
+    settings.LLM_MODEL = model
+    eval_user_id = f"interactive-{uuid.uuid4().hex[:8]}"
+
+    try:
+        # ── Show system prompt ───────────────────────────────────
+        system_prompt = _build_system_prompt(journey_directory, fixture.data_types)
+
+        _print_header("SYSTEM PROMPT")
+        print(f"{_C_DIM}{system_prompt}{_C_RESET}")
+
+        _print_header(f"INTERACTIVE JOURNEY  |  fixture: {fixture.name}  |  model: {model}")
+        print(f"  Data dir: {mock_root}")
+        print(f"  Type your responses. Commands: {_CMD_DONE}, {_CMD_QUIT}, {_CMD_TEMPLATE}, {_CMD_HELP}")
+        print(f"  Judge model: {judge_model}")
+        print(f"  Criteria: {len(fixture.expected_template_criteria)}")
+        print()
+    except BaseException:
+        # The main try/finally below has not been entered yet, so undo the
+        # global model override here before propagating the failure.
+        settings.LLM_MODEL = original_model
+        raise
+
+    conversation: list[dict[str, str]] = []
+    prompt_template: str | None = None
+    done = False
+    start_time = time.time()
+
+    try:
+        set_current_user(eval_user_id)
+
+        with mock.patch():
+            # ── Start ────────────────────────────────────────────
+            _print_system("Starting journey... (AI is exploring your files)")
+
+            start_frame: dict[str, Any] = {
+                "agent_type": "local",
+                "directory": journey_directory,
+                "data_types": fixture.data_types,
+                "session_id": f"interactive-{uuid.uuid4().hex[:8]}",
+            }
+
+            reply = await handle_journey_start(eval_user_id, start_frame)
+            session_id = reply["session_id"]
+            conversation.append({"role": "assistant", "content": reply["message"]})
+            _print_ai(reply["message"])
+
+            if reply["done"]:
+                prompt_template = reply.get("prompt_template")
+                done = True
+                _print_system("Journey completed on first reply (template generated).")
+
+            # ── Conversation loop ────────────────────────────────
+            while not done:
+                try:
+                    user_input = input(f"{_C_BOLD}{_C_BLUE}YOU:{_C_RESET} ").strip()
+                except (EOFError, KeyboardInterrupt):
+                    # Closed stdin or Ctrl-C at the prompt is treated as /quit.
+                    print()
+                    user_input = _CMD_QUIT
+
+                if not user_input:
+                    continue
+
+                # Handle commands
+                if user_input.lower() == _CMD_QUIT:
+                    _print_system("Aborted — no evaluation will be performed.")
+                    # Returning from inside the try runs the finally clause
+                    # below, which restores the model and clears the user.
+                    return InteractiveResult(
+                        fixture_name=fixture.name, model=model, judge_model=judge_model,
+                        prompt_template=None, conversation=conversation,
+                        user_comment="(aborted)", done=False,
+                        criteria_scores={}, overall_score=0.0,
+                        judge_reasoning="Session aborted by user.",
+                        elapsed_seconds=time.time() - start_time,
+                    )
+
+                if user_input.lower() == _CMD_HELP:
+                    print(_HELP_TEXT)
+                    continue
+
+                if user_input.lower() == _CMD_TEMPLATE:
+                    if prompt_template:
+                        print(f"\n{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
+                    else:
+                        _print_system("No template generated yet.")
+                    continue
+
+                if user_input.lower() == _CMD_DONE:
+                    _print_system("Ending conversation...")
+                    break
+
+                # ── Send message to AI ───────────────────────────
+                conversation.append({"role": "user", "content": user_input})
+                _print_system("AI is thinking...")
+
+                msg_frame: dict[str, Any] = {
+                    "session_id": session_id,
+                    "message": user_input,
+                }
+                reply = await handle_journey_message(eval_user_id, msg_frame)
+                conversation.append({"role": "assistant", "content": reply["message"]})
+                _print_ai(reply["message"])
+
+                if reply["done"]:
+                    prompt_template = reply.get("prompt_template")
+                    done = True
+                    _print_system("Journey completed — template generated!")
+
+    except Exception as exc:
+        # A pipeline failure ends the conversation but not the session: the
+        # partial transcript is still commented on, judged and reported below.
+        logger.error("interactive journey failed: %s", exc)
+        _print_system(f"Error: {exc}")
+    finally:
+        settings.LLM_MODEL = original_model
+        clear_current_user()
+
+    elapsed = time.time() - start_time
+    turns = len([c for c in conversation if c["role"] == "user"])
+
+    # ── Show template if generated ───────────────────────────────
+    if prompt_template:
+        _print_header("GENERATED TEMPLATE")
+        print(f"{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
+    else:
+        _print_system("No template was generated during this session.")
+
+    # ── User comment ─────────────────────────────────────────────
+    _print_header("YOUR EVALUATION")
+    print("  Write your comment about this interaction (press Enter twice to finish):")
+    print()
+    comment_lines: list[str] = []
+    try:
+        while True:
+            line = input()
+            # Two consecutive empty lines end the comment; a single empty
+            # line is kept as part of it.
+            if line == "" and comment_lines and comment_lines[-1] == "":
+                comment_lines.pop()  # remove trailing empty
+                break
+            comment_lines.append(line)
+    except (EOFError, KeyboardInterrupt):
+        pass
+    user_comment = "\n".join(comment_lines).strip() or "(no comment)"
+
+    # ── Judge ────────────────────────────────────────────────────
+    _print_header("LLM JUDGE EVALUATION")
+    _print_system(f"Scoring with {judge_model}...")
+
+    criteria_scores, overall_quality, judge_reasoning = await _judge_interactive(
+        conversation=conversation,
+        prompt_template=prompt_template,
+        user_comment=user_comment,
+        criteria=fixture.expected_template_criteria,
+        judge_model=judge_model,
+    )
+
+    # ── Display scores ───────────────────────────────────────────
+    print()
+    for criterion, score in criteria_scores.items():
+        _print_score(criterion, score)
+
+    # Mean of the per-criterion scores (distinct from the judge's own
+    # overall_quality figure).
+    overall = (
+        sum(criteria_scores.values()) / len(criteria_scores)
+        if criteria_scores
+        else 0.0
+    )
+
+    print(f"\n  {_C_BOLD}Criteria avg:      {overall:.2f}{_C_RESET}")
+    print(f"  {_C_BOLD}Overall quality:   {overall_quality:.2f}{_C_RESET}")
+    print(f"  {_C_BOLD}Turns:             {turns}{_C_RESET}")
+    print(f"  {_C_BOLD}Time:              {elapsed:.1f}s{_C_RESET}")
+    print(f"\n  {_C_DIM}Judge: {judge_reasoning}{_C_RESET}")
+    print(f"  {_C_DIM}Your comment: {user_comment}{_C_RESET}\n")
+
+    result = InteractiveResult(
+        fixture_name=fixture.name,
+        model=model,
+        judge_model=judge_model,
+        prompt_template=prompt_template,
+        conversation=conversation,
+        user_comment=user_comment,
+        done=done,
+        criteria_scores=criteria_scores,
+        overall_score=overall_quality,
+        judge_reasoning=judge_reasoning,
+        elapsed_seconds=elapsed,
+    )
+
+    # ── Report to Langfuse ───────────────────────────────────────
+    trace_id = langfuse_eval.log_eval_trace(
+        fixture_name=fixture.name,
+        model=model,
+        prompt_variant="interactive",
+        prompt_template=prompt_template or "(not generated)",
+        actual_mutations=[{
+            "conversation": conversation[:30],
+            "user_comment": user_comment,
+        }],
+        scores_summary=result.summary(),
+        langfuse_prompt_names=["journey_system"],
+    )
+
+    if trace_id:
+        from eval.scorer import EvalScores
+        # EvalScores fields are reused for this mode: precision and f1 carry
+        # the criteria average, recall carries whether the journey completed.
+        scores_obj = EvalScores(
+            fixture_name=fixture.name,
+            model=model,
+            prompt_variant="interactive",
+            precision=overall,
+            recall=float(done),
+            f1=overall,
+            llm_judge_score=overall_quality,
+            llm_judge_reasoning=judge_reasoning,
+        )
+        langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id)
+        _print_system(f"Results reported to Langfuse (trace: {trace_id})")
+    else:
+        _print_system("Langfuse not configured — results not reported.")
+
+    return result
--- a/services/batch-agent/eval/journey_runner.py
+++ b/services/batch-agent/eval/journey_runner.py
@@ -94,7 +94,7 @@ async def _judge_template(

    Returns (criteria_scores, reasoning).
    """
-    from app.llm import get_llm
+    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)

@@ -152,13 +152,23 @@ async def run_single_journey_eval(
    model: str,
    *,
    judge_model: str = "gpt-4o-mini",
+    data_dir: Path | None = None,
 ) -> JourneyEvalResult:
-    """Execute one journey eval: start → messages → score template."""
+    """Execute one journey eval: start \u2192 messages \u2192 score template."""
    from shared.config import settings

-    # Build mock executor for filesystem tools
+    # When data_dir is given, use its parent as MockExecutor root
+    # and its name as the journey directory so the LLM sees a
+    # meaningful path (not ".").
+    if data_dir:
+        mock_root = data_dir.parent
+        journey_directory = data_dir.name
+    else:
+        mock_root = fixture.fixture_path.parent
+        journey_directory = fixture.directory
+
    mock = MockExecutor(
-        fixture_dir=fixture.fixture_dir,
+        fixture_dir=mock_root,
        seed_records={},
    )

@@ -178,7 +188,7 @@ async def run_single_journey_eval(
    done = False

    try:
-        from app.ws_context import set_current_user, clear_current_user
+        from shared.ws_context import set_current_user, clear_current_user
        from app.journey import handle_journey_start, handle_journey_message, _sessions

        set_current_user(eval_user_id)
@@ -186,7 +196,7 @@ async def run_single_journey_eval(
            # ── Start the journey ────────────────────────────────
            start_frame: dict[str, Any] = {
                "agent_type": "local",
-                "directory": fixture.directory,
+                "directory": journey_directory,
                "data_types": fixture.data_types,
                "session_id": f"eval-{uuid.uuid4().hex[:8]}",
            }
@@ -246,7 +256,7 @@ async def run_single_journey_eval(
        logger.error("journey_eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
    finally:
        settings.LLM_MODEL = original_model
-        from app.ws_context import clear_current_user
+        from shared.ws_context import clear_current_user
        clear_current_user()

    elapsed = time.time() - start_time
@@ -297,6 +307,7 @@ async def run_single_journey_eval(
        prompt_template=prompt_template or "(not generated)",
        actual_mutations=[{"conversation": conversation[:20]}],
        scores_summary=result.summary(),
+        langfuse_prompt_names=["journey_system"],
    )

    if trace_id:
@@ -321,6 +332,7 @@ async def run_journey_fixture_eval(
    models: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
+    data_dir: Path | None = None,
 ) -> list[JourneyEvalResult]:
    """Run all models for a journey fixture."""
    langfuse_eval.sync_journey_fixture_to_dataset(fixture)
@@ -329,6 +341,7 @@ async def run_journey_fixture_eval(
    for model in models:
        result = await run_single_journey_eval(
            fixture, model, judge_model=judge_model,
+            data_dir=data_dir,
        )
        results.append(result)

--- a/services/batch-agent/eval/langfuse_eval.py
+++ b/services/batch-agent/eval/langfuse_eval.py
@@ -1,21 +1,21 @@
 """Langfuse evaluation integration — datasets, runs, and scoring.

-Uses the Langfuse Python SDK to:
+Uses the Langfuse Python SDK v4 (OpenTelemetry-based) to:

 1. **Sync fixtures → Langfuse datasets**: Each YAML fixture becomes a dataset,
   each prompt variant + expected pair becomes a dataset item.

 2. **Track eval runs**: Each (fixture × model × prompt_variant) execution
-   is recorded as a dataset run with linked traces and scores.
+   is recorded as a trace with linked scores.

 3. **Post scores**: precision, recall, F1, field_accuracy, llm_judge are
-   posted as numeric scores on the trace/run.
+   posted as numeric scores on the trace.
 """

 from __future__ import annotations

-import json
 import logging
+import os
 from typing import Any

 from shared.config import settings
@@ -26,16 +26,16 @@ logger = logging.getLogger(__name__)


 def _get_langfuse():
-    """Get or create a Langfuse client instance."""
+    """Get or create a Langfuse client instance (SDK v4)."""
    if not settings.LANGFUSE_SECRET_KEY or not settings.LANGFUSE_PUBLIC_KEY:
        return None
    try:
-        from langfuse import Langfuse
-        return Langfuse(
-            secret_key=settings.LANGFUSE_SECRET_KEY,
-            public_key=settings.LANGFUSE_PUBLIC_KEY,
-            host=settings.LANGFUSE_HOST,
-        )
+        os.environ.setdefault("LANGFUSE_SECRET_KEY", settings.LANGFUSE_SECRET_KEY)
+        os.environ.setdefault("LANGFUSE_PUBLIC_KEY", settings.LANGFUSE_PUBLIC_KEY)
+        if settings.LANGFUSE_HOST:
+            os.environ.setdefault("LANGFUSE_HOST", settings.LANGFUSE_HOST)
+        from langfuse import get_client
+        return get_client()
    except Exception as exc:
        logger.warning("langfuse_eval: failed to create client: %s", exc)
        return None
@@ -61,35 +61,44 @@ def sync_fixture_to_dataset(fixture: EvalFixture) -> str | None:
        lf.create_dataset(
            name=dataset_name,
            description=fixture.description,
-            metadata={"data_types": fixture.data_types, "file_extensions": fixture.file_extensions},
+            metadata={
+                "data_types": ",".join(fixture.data_types),
+                "file_extensions": ",".join(fixture.file_extensions) if fixture.file_extensions else "",
+            },
        )
    except Exception:
        # Dataset may already exist — that's fine
        pass

-    expected_output = {}
-    for rec in fixture.expected:
-        expected_output.setdefault(rec.table, []).append(rec.fields)
+    # Build expected_output appropriate to the fixture's mode
+    expected_output: dict[str, Any] = {}
+    if fixture.mode in ("step1", "full") and fixture.expected_classification:
+        expected_output["classifications"] = [
+            {"file": ec.file, "project_id": ec.project_id, "domains": ec.domains}
+            for ec in fixture.expected_classification
+        ]
+    if fixture.mode in ("step2", "full") and fixture.expected:
+        for rec in fixture.expected:
+            expected_output.setdefault(rec.table, []).append(rec.fields)

-    for variant_name, prompt_template in fixture.prompt_variants.items():
-        item_id = f"{fixture.name}--{variant_name}"
-        try:
-            lf.create_dataset_item(
-                dataset_name=dataset_name,
-                id=item_id,
-                input={
-                    "directory": fixture.directory,
-                    "data_types": fixture.data_types,
-                    "prompt_template": prompt_template,
-                    "seed_records": fixture.seed_records,
-                },
-                expected_output=expected_output,
-                metadata={"prompt_variant": variant_name},
-            )
-        except Exception as exc:
-            logger.warning(
-                "langfuse_eval: failed to upsert dataset item %s: %s", item_id, exc
-            )
+    item_id = f"{fixture.name}--{fixture.mode}"
+    try:
+        lf.create_dataset_item(
+            dataset_name=dataset_name,
+            id=item_id,
+            input={
+                "directory": fixture.directory,
+                "data_types": fixture.data_types,
+                "mode": fixture.mode,
+                "seed_records": fixture.seed_records,
+            },
+            expected_output=expected_output,
+            metadata={"mode": fixture.mode},
+        )
+    except Exception as exc:
+        logger.warning(
+            "langfuse_eval: failed to upsert dataset item %s: %s", item_id, exc
+        )

    lf.flush()
    logger.info("langfuse_eval: synced fixture '%s' → dataset '%s'", fixture.name, dataset_name)
@@ -114,7 +123,7 @@ def sync_journey_fixture_to_dataset(fixture) -> str | None:
        lf.create_dataset(
            name=dataset_name,
            description=fixture.description,
-            metadata={"type": "journey", "data_types": fixture.data_types},
+            metadata={"type": "journey", "data_types": ",".join(fixture.data_types)},
        )
    except Exception:
        pass  # Dataset may already exist
@@ -148,18 +157,26 @@ def create_eval_run(
    *,
    metadata: dict[str, Any] | None = None,
 ) -> str:
-    """Create a dataset run in Langfuse. Returns the run name."""
+    """Create a dataset run in Langfuse. Returns the run name.
+
+    Note: In SDK v4, dataset runs are created implicitly via
+    dataset.run_experiment(). This function is kept for backwards
+    compatibility but may not create a run.
+    """
    lf = _get_langfuse()
    if lf is None:
        return run_name

    try:
-        lf.create_dataset_run(
-            dataset_name=dataset_name,
-            run_name=run_name,
-            metadata=metadata or {},
-        )
-        lf.flush()
+        if hasattr(lf, "create_dataset_run"):
+            lf.create_dataset_run(
+                dataset_name=dataset_name,
+                run_name=run_name,
+                metadata=metadata or {},
+            )
+            lf.flush()
+        else:
+            logger.debug("langfuse_eval: create_dataset_run not available in SDK v4")
    except Exception as exc:
        logger.warning("langfuse_eval: failed to create run %s: %s", run_name, exc)

@@ -185,21 +202,22 @@ def post_eval_scores(
        ("precision", scores.precision),
        ("recall", scores.recall),
        ("f1", scores.f1),
-        ("field_accuracy", scores.field_accuracy),
    ]
+    # Only post field_accuracy when there are field-level scores (step2/full)
+    if scores.field_scores:
+        score_data.append(("field_accuracy", scores.field_accuracy))
    if scores.llm_judge_score is not None:
        score_data.append(("llm_judge", scores.llm_judge_score))

    for name, value in score_data:
        try:
-            kwargs: dict[str, Any] = {
-                "name": name,
-                "value": value,
-                "comment": f"{scores.fixture_name} | {scores.model} | {scores.prompt_variant}",
-            }
-            if trace_id:
-                kwargs["trace_id"] = trace_id
-            lf.score(**kwargs)
+            lf.create_score(
+                name=name,
+                value=value,
+                trace_id=trace_id,
+                data_type="NUMERIC",
+                comment=f"{scores.fixture_name} | {scores.model} | {scores.prompt_variant}",
+            )
        except Exception as exc:
            logger.warning("langfuse_eval: failed to post score %s: %s", name, exc)

@@ -218,12 +236,20 @@ def log_eval_trace(
    prompt_template: str,
    actual_mutations: list[dict],
    scores_summary: dict[str, Any],
+    step1_results: list[dict] | None = None,
    dataset_name: str | None = None,
    run_name: str | None = None,
    dataset_item_id: str | None = None,
+    langfuse_prompt_names: list[str] | None = None,
 ) -> str | None:
    """Create a Langfuse trace for one eval execution and link it to a dataset run.

+    Uses SDK v4 observation API (traces are created implicitly by root spans).
+    ``langfuse_prompt_names`` can contain one or two prompt names to link
+    (e.g. ``["batch_file_classifier", "batch_processing"]`` for full mode).
+    Each prompt gets its own generation-type observation for per-version
+    metrics tracking.
+
    Returns the trace_id, or None if Langfuse is unavailable.
    """
    lf = _get_langfuse()
@@ -231,38 +257,71 @@ def log_eval_trace(
        return None

    try:
-        trace = lf.trace(
-            name=f"eval-{fixture_name}",
-            input={
-                "prompt_template": prompt_template,
-                "model": model,
-                "prompt_variant": prompt_variant,
-            },
-            output={
-                "mutations": actual_mutations[:50],
-                "scores": scores_summary,
-            },
+        from langfuse import propagate_attributes
+
+        # Fetch prompt objects for linking
+        prompt_objs: list[tuple[str, Any]] = []
+        for pname in (langfuse_prompt_names or []):
+            try:
+                obj = lf.get_prompt(name=pname, cache_ttl_seconds=300)
+                prompt_objs.append((pname, obj))
+                logger.info("langfuse_eval: linked prompt '%s' (type=%s)", pname, type(obj).__name__)
+            except Exception as exc:
+                logger.warning("langfuse_eval: prompt '%s' not found — %s", pname, exc)
+
+        # Build trace output dict
+        trace_output: dict[str, Any] = {"scores": scores_summary}
+        if step1_results:
+            trace_output["classifications"] = step1_results
+        if actual_mutations:
+            trace_output["mutations"] = actual_mutations[:50]
+
+        with propagate_attributes(
+            trace_name=f"eval-{fixture_name}",
            metadata={
-                "eval": True,
+                "eval": "true",
                "fixture": fixture_name,
                "model": model,
                "prompt_variant": prompt_variant,
            },
            tags=["eval", f"model:{model}", f"variant:{prompt_variant}"],
-        )
+        ):
+            # Root span for the eval run
+            span = lf.start_observation(name=f"eval-{fixture_name}")
+            span.update(
+                input={
+                    "prompt_template": prompt_template,
+                    "model": model,
+                    "prompt_variant": prompt_variant,
+                },
+                output=trace_output,
+            )
+            trace_id = span.trace_id

-        # Link to dataset run if available
-        if dataset_name and run_name and dataset_item_id:
-            try:
-                dataset = lf.get_dataset(dataset_name)
-                item = dataset.get_item(dataset_item_id)
-                if item:
-                    item.link(trace, run_name)
-            except Exception as exc:
-                logger.warning("langfuse_eval: failed to link trace to dataset run: %s", exc)
+            # Create a generation-type observation per linked prompt
+            for pname, pobj in prompt_objs:
+                gen = lf.start_observation(
+                    name=f"prompt-{pname}",
+                    prompt=pobj,
+                    as_type="generation",
+                )
+                gen.end()
+
+            # Link to dataset run if available
+            if dataset_name and run_name and dataset_item_id:
+                try:
+                    dataset = lf.get_dataset(dataset_name)
+                    for item in dataset.items:
+                        if item.id == dataset_item_id:
+                            item.link(span, run_name)
+                            break
+                except Exception as exc:
+                    logger.warning("langfuse_eval: failed to link trace to dataset run: %s", exc)
+
+            span.end()

        lf.flush()
-        return trace.id
+        return trace_id
    except Exception as exc:
        logger.warning("langfuse_eval: failed to create eval trace: %s", exc)
        return None
--- a/services/batch-agent/eval/mock_executor.py
+++ b/services/batch-agent/eval/mock_executor.py
@@ -1,6 +1,6 @@
 """Mock executor — intercepts execute_on_client for offline E2E testing.

-Patches ``app.ws_context.execute_on_client`` so agent pipeline runs don't
+Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't
 require a live Electron client or Redis.  Instead:

 - **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
@@ -20,6 +20,7 @@ import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
+from contextlib import contextmanager, asynccontextmanager
 from unittest.mock import AsyncMock, patch


@@ -33,6 +34,30 @@ class Mutation:
    timestamp: float = field(default_factory=time.time)


+# ── Fake DB helpers (used to bypass async_session in full mode) ───────
+
+class _FakeRow:
+    """Mimics an AgentRunLog row returned by SQLAlchemy.
+
+    Scalar defaults live on the class; callers may overwrite any of them
+    on an instance with plain attribute assignment.
+    """
+
+    id = 0
+    status = "running"
+    items_processed = 0
+    items_created = 0
+    completed_at = None
+
+    def __init__(self) -> None:
+        # Per-instance list: a class-level ``[]`` would be one shared object,
+        # so errors appended during one eval run would show up on every
+        # later fake row as well.
+        self.errors: list[str] = []
+
+
+class _FakeResult:
+    """Mimics a SQLAlchemy ``Result`` with ``scalar_one_or_none``.
+
+    Only ``scalar_one_or_none`` is provided; it hands back the row given
+    at construction time.
+    """
+    def __init__(self, row: _FakeRow) -> None:
+        # The single row returned by every ``scalar_one_or_none`` call.
+        self._row = row
+
+    def scalar_one_or_none(self) -> _FakeRow:
+        # Always returns the stored row -- this fake never reports "no row".
+        return self._row
+
+
@dataclass
 class MockExecutor:
    """In-memory executor that replaces Redis-based tool round-trip.
@@ -77,12 +102,37 @@ class MockExecutor:

    # ── Context manager for patching ──────────────────────────────

+    @contextmanager
    def patch(self):
-        """Return an async context-manager that patches execute_on_client."""
-        return patch(
-            "app.ws_context.execute_on_client",
-            new=AsyncMock(side_effect=self._handle),
-        )
+        """Patch execute_on_client and DB session at all usage sites.
+
+        Yields the shared ``AsyncMock`` (side effect: ``self._handle``) that
+        replaces ``execute_on_client`` at every listed target.  All patches
+        are undone when the ``with`` block exits.  If installing one of the
+        patches fails, the ones already applied are rolled back too, so a
+        bad target cannot leave earlier patches active.
+        """
+        # Local import keeps this block self-contained.
+        from contextlib import ExitStack
+
+        mock_fn = AsyncMock(side_effect=self._handle)
+        targets = [
+            "shared.ws_context.execute_on_client",
+            "app.agent_runner.execute_on_client",
+            "app.agents.filesystem_agent.execute_on_client",
+        ]
+
+        # Mock async_session so run_local_agent / _finalize_run skip real DB
+        fake_row = _FakeRow()
+        fake_db = AsyncMock()
+        fake_db.commit = AsyncMock()
+        fake_db.refresh = AsyncMock()
+        fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row))
+        # Plain callable: an AsyncMock attribute would return a coroutine
+        # when ``add`` is called without ``await``.
+        fake_db.add = lambda obj: None  # noqa: ARG005
+
+        @asynccontextmanager
+        async def _fake_session():
+            yield fake_db
+
+        # ExitStack registers each patch only after it has started, and
+        # unwinds every registered patch on exit or on a failed start.
+        with ExitStack() as stack:
+            for target in targets:
+                stack.enter_context(patch(target, new=mock_fn))
+            stack.enter_context(patch("app.agent_runner.async_session", _fake_session))
+            yield mock_fn

    # ── Internal dispatch ─────────────────────────────────────────

--- a/services/batch-agent/eval/runner.py
+++ b/services/batch-agent/eval/runner.py
@@ -1,28 +1,31 @@
 """Eval runner — orchestrates fixture → mock → agent pipeline → scoring.

-For each (fixture × model × prompt_variant) combination:
-1. Build a MockExecutor with fixture data
-2. Patch execute_on_client
-3. Override LLM_MODEL in shared settings
-4. Run the batch agent pipeline (run_local_agent)
-5. Collect mutations from the mock
-6. Score against expected results (field match + optional LLM judge)
-7. Report scores to Langfuse
-8. Print results
+Supports three eval modes:
+
+- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
+  Calls the LLM with fixture-provided ``domain_definitions`` and
+  ``projects_list`` and compares output against ``expected_classification``.
+
+- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
+  Compiles the prompt with fixture-provided ``existing_context``,
+  ``project_context``, ``data_types``, and ``custom_prompt_section``,
+  then runs the tool-calling loop.  Mutations are scored against
+  ``expected`` records.
+
+- **full**: Run ``run_local_agent()`` end-to-end (both steps).
+  Scored on both classification and extraction.
 """

 from __future__ import annotations

-import asyncio
 import copy
 import json
 import logging
 import time
 import uuid
-from pathlib import Path
 from typing import Any

-from eval.config import EvalFixture, ExpectedRecord
+from eval.config import EvalFixture, ExpectedClassification
 from eval.mock_executor import MockExecutor
 from eval.scorer import (
    EvalScores,
@@ -36,72 +39,193 @@ from eval import langfuse_eval
 logger = logging.getLogger(__name__)


-async def run_single_eval(
+# ── Step 1 runner ─────────────────────────────────────────────────────────
+
+
+async def _run_step1(
    fixture: EvalFixture,
    model: str,
-    prompt_variant: str,
-    *,
-    use_llm_judge: bool = True,
-    judge_model: str = "gpt-4o-mini",
-) -> EvalScores:
-    """Execute one (fixture × model × prompt_variant) eval and return scores."""
-    from shared.config import settings
+    mock: MockExecutor,
+) -> list[dict[str, Any]]:
+    """Run step-1 classification for each expected file.

-    prompt_template = fixture.prompt_variants.get(prompt_variant, "")
+    Returns a list of result dicts:
+    ``[{file, project_id, domains, new_project_name}, ...]``
+
+    One result is produced per entry of ``fixture.expected_classification``,
+    in the same order.  ``model`` is not read inside this function.
+    """
+    from app.agent_runner import _classify_file

-    # Build mock executor
-    seed = copy.deepcopy(fixture.seed_records)
-    mock = MockExecutor(
-        fixture_dir=fixture.fixture_dir,
-        seed_records=seed,
+    results: list[dict[str, Any]] = []
+    for ec in fixture.expected_classification:
+        # Read the file content through the mock
+        file_result = await mock._handle(
+            action="read_file_content",
+            data={"path": ec.file},
+        )
+        # A response without a "content" key is classified as empty text.
+        file_content: str = file_result.get("content", "")
+
+        # _classify_file returns (project_id, domains, new_project_name).
+        project_id, domains, new_name = await _classify_file(
+            file_path=ec.file,
+            file_content=file_content,
+            projects=fixture.projects_list,
+            config_data_types=fixture.data_types,
+        )
+        results.append({
+            "file": ec.file,
+            "project_id": project_id,
+            "domains": domains,
+            "new_project_name": new_name,
+        })
+    return results
+
+
+def _score_step1(
+    fixture: EvalFixture,
+    results: list[dict[str, Any]],
+) -> tuple[float, float, float, str]:
+    """Score step-1 results. Returns (precision, recall, f1, reasoning).
+
+    An expected classification counts as matched when the result for the
+    same file has the same ``project_id`` and, if the expectation lists any
+    domains, the same set of domains.  All three returned metrics equal
+    ``matched / len(expected)``.  ``reasoning`` holds one OK / FAIL / MISS
+    line per expected file, joined by newlines.
+
+    With no expected classifications the result is
+    ``(0.0, 0.0, 0.0, "No expected classifications")``.
+    """
+    expected = fixture.expected_classification
+    if not expected:
+        return 0.0, 0.0, 0.0, "No expected classifications"
+
+    # Index results by file once instead of scanning the list per expected
+    # entry; ``setdefault`` keeps the first result when a file repeats.
+    by_file: dict[Any, dict[str, Any]] = {}
+    for result in results:
+        by_file.setdefault(result["file"], result)
+
+    matched = 0
+    details: list[str] = []
+
+    for ec in expected:
+        actual = by_file.get(ec.file)
+        if actual is None:
+            details.append(f"  MISS {ec.file}: not processed")
+            continue
+
+        pid_ok = actual["project_id"] == ec.project_id
+        # An empty expected-domain list means "do not check domains".
+        # A ``None`` from the classifier is compared as "no domains" rather
+        # than crashing the scorer with ``set(None)``.
+        domains_ok = set(actual["domains"] or ()) == set(ec.domains) if ec.domains else True
+
+        if pid_ok and domains_ok:
+            matched += 1
+            details.append(f"  OK   {ec.file}: project={actual['project_id']}, domains={actual['domains']}")
+        else:
+            parts: list[str] = []
+            if not pid_ok:
+                parts.append(f"project expected={ec.project_id} got={actual['project_id']}")
+            if not domains_ok:
+                parts.append(f"domains expected={ec.domains} got={actual['domains']}")
+            details.append(f"  FAIL {ec.file}: {'; '.join(parts)}")
+
+    # ``expected`` is non-empty here, so the division is always defined.
+    precision = matched / len(expected)
+    recall = precision  # in step1, precision == recall (same denominator)
+    f1 = precision  # same
+    reasoning = "\n".join(details)
+    return precision, recall, f1, reasoning
+
+
+# ── Step 2 runner ─────────────────────────────────────────────────────────
+
+
+async def _run_step2(
+    fixture: EvalFixture,
+    model: str,
+    mock: MockExecutor,
+) -> None:
+    """Run step-2 processing for each file in the fixture directory.
+
+    Compiles the ``batch_processing`` prompt (falling back to
+    ``_PROCESSING_SYSTEM_PROMPT``) with fixture-provided variables
+    and runs the tool-calling loop.  Mutations are captured by the mock.
+
+    Files are listed and read by calling the mock's ``_handle`` directly.
+    Non-file entries, files filtered out by ``fixture.file_extensions`` and
+    files whose content is empty or whitespace-only are skipped.
+    ``model`` is not read inside this function.
+    """
+    from app.agent_runner import (
+        _PROCESSING_SYSTEM_PROMPT,
+        _build_processing_tools,
+        _run_agent_with_tools,
+        _MAX_PROCESSING_STEPS,
+    )
+    from app import tracing
+
+    # Compile the processing prompt with fixture variables
+    system_prompt = tracing.compile_prompt(
+        "batch_processing",
+        fallback=_PROCESSING_SYSTEM_PROMPT,
+        variables={
+            "existing_context": fixture.existing_context,
+            "project_context": fixture.project_context,
+            "data_types": ", ".join(fixture.data_types),
+            "custom_prompt_section": fixture.custom_prompt_section,
+        },
    )

-    # Override the LLM model for this run
-    original_model = settings.LLM_MODEL
-    settings.LLM_MODEL = model
+    tools = _build_processing_tools(fixture.data_types)
+
+    # Scan files in the fixture directory
+    file_entries = await mock._handle(
+        action="list_directory",
+        data={"path": fixture.directory},
+    )
+    for entry in file_entries.get("entries", []):
+        if entry.get("type") != "file":
+            continue
+        # Filter by extension if specified
+        if fixture.file_extensions:
+            # ``ext`` is the text after the last dot, without the dot, and
+            # the membership test is case-sensitive.
+            # NOTE(review): presumably ``file_extensions`` entries use the
+            # same dot-less form -- confirm against the fixture YAMLs.
+            ext = entry["name"].rsplit(".", 1)[-1] if "." in entry["name"] else ""
+            if ext not in fixture.file_extensions:
+                continue
+
+        file_result = await mock._handle(
+            action="read_file_content",
+            data={"path": entry["path"]},
+        )
+        file_content: str = file_result.get("content", "")
+        # Nothing to extract from an empty / whitespace-only file.
+        if not file_content.strip():
+            continue
+
+        # One agent run per file; the return value is not used here.
+        await _run_agent_with_tools(
+            system_prompt=system_prompt,
+            user_message=(
+                f"Process this file and extract relevant information.\n\n"
+                f"File: {entry['path']}\n\nContent:\n{file_content}"
+            ),
+            tools=tools,
+            max_steps=_MAX_PROCESSING_STEPS,
+        )
+
+
+# ── Full runner ───────────────────────────────────────────────────────────
+
+
+async def _run_full(
+    fixture: EvalFixture,
+    model: str,
+    mock: MockExecutor,
+    user_id: str,
+) -> None:
+    """Run the full two-step pipeline via ``run_local_agent``.
+
+    Builds the trigger payload from the fixture and awaits
+    ``run_local_agent(user_id, trigger_data)`` inside ``mock.patch()``.
+    ``model`` is not read inside this function.
+    """
+    from app.agent_runner import run_local_agent

-    # Build trigger data (same shape as what redis_consumer delivers)
    trigger_data: dict[str, Any] = {
        "type": "agent_trigger",
        "directory": fixture.directory,
        "directory_paths": [fixture.directory],
        "data_types": fixture.data_types,
        "file_extensions": fixture.file_extensions,
-        "prompt_template": prompt_template,
+        "prompt_template": fixture.custom_prompt_section,
        "device_id": "eval-harness",
        "run_context": {
-            "agent_id": f"eval-{fixture.name}-{prompt_variant}",
-            "run_id": None,  # skip DB logging during eval
+            "agent_id": f"eval-{fixture.name}",
+            "run_id": None,
        },
    }

-    eval_user_id = f"eval-{uuid.uuid4().hex[:8]}"
+    # The patch is applied here, so callers need not wrap this call.
+    with mock.patch():
+        await run_local_agent(user_id, trigger_data)

-    logger.info(
-        "eval: starting %s | model=%s | variant=%s",
-        fixture.name, model, prompt_variant,
-    )
-    start_time = time.time()

-    try:
-        # Patch execute_on_client + set user context, then run the pipeline
-        from app.ws_context import set_current_user, clear_current_user
-        from app.agent_runner import run_local_agent
+# ── Scoring helpers ───────────────────────────────────────────────────────

-        set_current_user(eval_user_id)
-        with mock.patch():
-            await run_local_agent(eval_user_id, trigger_data)
-    except Exception as exc:
-        logger.error("eval: pipeline failed for %s/%s/%s: %s", fixture.name, model, prompt_variant, exc)
-    finally:
-        settings.LLM_MODEL = original_model
-        from app.ws_context import clear_current_user
-        clear_current_user()

-    elapsed = time.time() - start_time
-    logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
+def _score_mutations(
+    fixture: EvalFixture,
+    mock: MockExecutor,
+) -> tuple[list[FieldScore], float, float, float, int, int]:
+    """Score mutations against expected records.

-    # ── Score results ────────────────────────────────────────────
+    Returns (field_scores, precision, recall, f1, extra, missing).
+    """
    all_field_scores: list[FieldScore] = []
    total_expected = 0
    total_actual = 0
@@ -109,12 +233,10 @@ async def run_single_eval(
    total_extra = 0
    total_missing = 0

-    # Group expected by table
    expected_by_table: dict[str, list[dict]] = {}
    for rec in fixture.expected:
        expected_by_table.setdefault(rec.table, []).append(rec.fields)

-    # Compare against actual mutations (inserts + updates)
    tables = set(expected_by_table.keys()) | {m.table for m in mock.mutations}
    for table in tables:
        expected_records = expected_by_table.get(table, [])
@@ -131,49 +253,160 @@ async def run_single_eval(
        total_missing += missing

    precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
+    return all_field_scores, precision, recall, f1, total_extra, total_missing

-    scores = EvalScores(
-        fixture_name=fixture.name,
-        model=model,
-        prompt_variant=prompt_variant,
-        field_scores=all_field_scores,
-        precision=precision,
-        recall=recall,
-        f1=f1,
-        extra_records=total_extra,
-        missing_records=total_missing,
+
+# ── Main entry point ──────────────────────────────────────────────────────
+
+
+async def run_single_eval(
+    fixture: EvalFixture,
+    model: str,
+    *,
+    use_llm_judge: bool = True,
+    judge_model: str = "gpt-4o-mini",
+) -> EvalScores:
+    """Execute one eval run for a fixture + model.  Mode is read from the fixture.
+
+    Steps: build a ``MockExecutor`` from the fixture, override
+    ``settings.LLM_MODEL`` for the duration of the run, dispatch on
+    ``fixture.mode`` (``step1`` / ``step2`` / ``full``), score the outcome,
+    then report to Langfuse.
+
+    A pipeline exception is logged and scoring continues with whatever
+    was produced before the failure.  Langfuse reporting is best-effort:
+    failures are logged and never raised.
+
+    Args:
+        fixture: Fixture describing inputs, expectations and the eval mode.
+        model: Model name written to ``settings.LLM_MODEL`` during the run.
+        use_llm_judge: Run the LLM judge (step2/full with expected records only).
+        judge_model: Model passed to ``llm_judge_score``.
+
+    Returns:
+        The ``EvalScores`` for this run.
+    """
+    from shared.config import settings
+    from shared.ws_context import set_current_user, clear_current_user
+
+    seed = copy.deepcopy(fixture.seed_records)
+    mock = MockExecutor(
+        fixture_dir=fixture.fixture_path.parent,
+        seed_records=seed,
    )

-    # ── Optional LLM judge ───────────────────────────────────────
-    if use_llm_judge and fixture.expected:
-        all_expected = [r.fields for r in fixture.expected]
-        all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
-        judge_score, reasoning = await llm_judge_score(
-            all_expected, all_actual, judge_model=judge_model,
-        )
-        scores.llm_judge_score = judge_score
-        scores.llm_judge_reasoning = reasoning
+    original_model = settings.LLM_MODEL
+    settings.LLM_MODEL = model
+    eval_user_id = str(uuid.uuid4())

-    # ── Report to Langfuse ───────────────────────────────────────
-    dataset_name = f"batch-eval-{fixture.name}"
-    dataset_item_id = f"{fixture.name}--{prompt_variant}"
-    run_name = f"{model}--{prompt_variant}--{int(time.time())}"
+    logger.info(
+        "eval: starting %s | mode=%s | model=%s",
+        fixture.name, fixture.mode, model,
+    )
+    start_time = time.time()
+
+    step1_results: list[dict[str, Any]] = []
+    step1_reasoning = ""
+    # (precision, recall, f1) of step-1 classification; only set in full
+    # mode when the fixture carries expected classifications.
+    classification_scores: tuple[float, float, float] | None = None
+
+    try:
+        set_current_user(eval_user_id)
+
+        if fixture.mode == "step1":
+            with mock.patch():
+                step1_results = await _run_step1(fixture, model, mock)
+
+        elif fixture.mode == "step2":
+            with mock.patch():
+                await _run_step2(fixture, model, mock)
+
+        elif fixture.mode == "full":
+            with mock.patch():
+                # Step 1 — classification (independent from run_local_agent)
+                if fixture.expected_classification:
+                    step1_results = await _run_step1(fixture, model, mock)
+
+            # Step 2 — full pipeline (run_local_agent handles both steps).
+            # _run_full applies mock.patch() itself.
+            await _run_full(fixture, model, mock, eval_user_id)
+
+    except Exception as exc:
+        logger.error("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
+    finally:
+        settings.LLM_MODEL = original_model
+        clear_current_user()
+
+    elapsed = time.time() - start_time
+    logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
+
+    # ── Score ─────────────────────────────────────────────────────
+
+    if fixture.mode == "step1":
+        s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
+        scores = EvalScores(
+            fixture_name=fixture.name,
+            model=model,
+            prompt_variant=fixture.mode,
+            precision=s1_precision,
+            recall=s1_recall,
+            f1=s1_f1,
+            llm_judge_reasoning=step1_reasoning,
+        )
+    else:
+        # step2 or full — score mutations
+        field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
+        scores = EvalScores(
+            fixture_name=fixture.name,
+            model=model,
+            prompt_variant=fixture.mode,
+            field_scores=field_scores,
+            precision=precision,
+            recall=recall,
+            f1=f1,
+            extra_records=extra,
+            missing_records=missing,
+        )
+
+        # Add step1 classification scores for full mode.  Computed once
+        # here and reused when posting to Langfuse below.
+        if fixture.mode == "full" and fixture.expected_classification:
+            s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
+            classification_scores = (s1_p, s1_r, s1_f1)
+            scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"
+
+        # Optional LLM judge for extraction quality
+        if use_llm_judge and fixture.expected:
+            all_expected = [r.fields for r in fixture.expected]
+            all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
+            judge_score, reasoning = await llm_judge_score(
+                all_expected, all_actual, judge_model=judge_model,
+            )
+            scores.llm_judge_score = judge_score
+            if step1_reasoning:
+                scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
+            else:
+                scores.llm_judge_reasoning = reasoning
+
+    # ── Report to Langfuse ────────────────────────────────────────
+    prompt_names = {
+        "step1": ["batch_file_classifier"],
+        "step2": ["batch_processing"],
+        "full": ["batch_file_classifier", "batch_processing"],
+    }.get(fixture.mode, ["batch_processing"])

    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
-        prompt_variant=prompt_variant,
-        prompt_template=prompt_template,
+        prompt_variant=fixture.mode,
+        prompt_template=fixture.custom_prompt_section or "(default)",
        actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
        scores_summary=scores.summary(),
-        dataset_name=dataset_name,
-        run_name=run_name,
-        dataset_item_id=dataset_item_id,
+        step1_results=step1_results or None,
+        langfuse_prompt_names=prompt_names,
    )

    if trace_id:
        langfuse_eval.post_eval_scores(scores, trace_id=trace_id)

+        # For full mode, post classification scores separately
+        if classification_scores is not None:
+            # Resolve the client once instead of once per score.
+            try:
+                from langfuse import get_client
+                lf = get_client()
+            except Exception as exc:
+                logger.warning("eval: Langfuse client unavailable, classification scores not posted: %s", exc)
+                lf = None
+
+            if lf:
+                score_names = (
+                    "classification_precision",
+                    "classification_recall",
+                    "classification_f1",
+                )
+                for name, value in zip(score_names, classification_scores):
+                    # Best-effort per score: one failure must not block the
+                    # others, but it is logged rather than silently dropped.
+                    try:
+                        lf.create_score(
+                            name=name,
+                            value=value,
+                            trace_id=trace_id,
+                            data_type="NUMERIC",
+                            comment=f"{fixture.name} | {model} | full",
+                        )
+                    except Exception as exc:
+                        logger.warning("eval: failed to post %s to Langfuse: %s", name, exc)
+
    return scores


@@ -181,29 +414,20 @@ async def run_fixture_eval(
    fixture: EvalFixture,
    models: list[str],
    *,
-    variants: list[str] | None = None,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
 ) -> list[EvalScores]:
-    """Run all (model × variant) combinations for a fixture."""
-    if variants is None:
-        variants = list(fixture.prompt_variants.keys())
-
-    # Sync fixture to Langfuse dataset
+    """Run all models for a fixture."""
    langfuse_eval.sync_fixture_to_dataset(fixture)

    results: list[EvalScores] = []
    for model in models:
-        for variant in variants:
-            if variant not in fixture.prompt_variants:
-                logger.warning("eval: variant %r not found in fixture %s", variant, fixture.name)
-                continue
-            scores = await run_single_eval(
-                fixture, model, variant,
-                use_llm_judge=use_llm_judge,
-                judge_model=judge_model,
-            )
-            results.append(scores)
+        scores = await run_single_eval(
+            fixture, model,
+            use_llm_judge=use_llm_judge,
+            judge_model=judge_model,
+        )
+        results.append(scores)

    return results

@@ -214,18 +438,21 @@ def print_results(results: list[EvalScores]) -> None:
        print("\nNo eval results.")
        return

-    print("\n" + "=" * 90)
-    print(f"{'Fixture':<25} {'Model':<25} {'Variant':<15} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
-    print("-" * 90)
+    print("\n" + "=" * 95)
+    print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
+    print("-" * 95)

    for s in results:
        llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else "  --"
        print(
-            f"{s.fixture_name:<25} {s.model:<25} {s.prompt_variant:<15} "
+            f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} "
            f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} "
            f"{s.field_accuracy:>6.2f} {llm_str:>6}"
        )

+    print("=" * 95)
+    print()
+
    print("=" * 90)

    # If LLM judge reasoning is available, print it
--- a/services/batch-agent/eval/scorer.py
+++ b/services/batch-agent/eval/scorer.py
@@ -242,7 +242,7 @@ async def llm_judge_score(

    Returns (score, reasoning).
    """
-    from app.llm import get_llm
+    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)