refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions
--- a/services/batch-agent/eval/config.py
+++ b/services/batch-agent/eval/config.py
@@ -1,70 +1,16 @@
 """Eval configuration — YAML fixture loader and dataclasses.

-A *fixture* is a YAML file that defines a complete test scenario:
+Fixtures come in two families:

-.. code-block:: yaml
+1. **Agent fixtures** — test the batch agent pipeline.
+   Three modes controlled by ``mode``:

-    name: freelance-invoices
-    description: Extract tasks and notes from invoice PDFs (text layer)
-    directory: sample_files/invoices      # relative to fixture dir
-    data_types: [tasks, notes]
-    file_extensions: [txt, md]
+   ``step1``  — classification prompt only.
+   ``step2``  — processing prompt only.
+   ``full``   — both steps in sequence.

-    # Preseeded records the agent "sees" as existing data
-    seed_records:
-      projects:
-        - id: proj-1
-          name: "Website Redesign"
-          status: active
-      tasks: []
-
-    # Prompt variations to test (at least one required)
-    prompt_variants:
-      baseline: |
-        Extract action items as tasks and meeting summaries as notes.
-        Set priority based on urgency keywords.
-      detailed: |
-        Extract action items as tasks. Map "URGENT" to high priority,
-        "ASAP" to medium. Summaries become notes with full content.
-
-    # Expected extractions — what the agent SHOULD produce
-    expected:
-      tasks:
-        - title: "Send revised invoice to client"
-          priority: high
-          status: todo
-        - title: "Update project timeline"
-          priority: medium
-      notes:
-        - title: "Meeting summary - March kickoff"
-
-    # Optional: models to test (overrides CLI --models)
-    models: []
-
-A *journey fixture* tests the prompt-template builder conversation:
-
-.. code-block:: yaml
-
-    type: journey
-    name: journey-invoices
-    description: Test journey builds a good template for invoices
-    directory: sample_files/invoices
-    data_types: [tasks, notes]
-
-    # Simulated user responses for multi-turn conversation
-    user_messages:
-      - "I want to extract action items and meeting summaries"
-      - "Yes, map URGENTE to high priority"
-      - "That looks good, generate the template"
-
-    # Criteria the generated prompt_template should satisfy
-    expected_template_criteria:
-      - "mentions tasks and notes as target entities"
-      - "includes priority mapping rules"
-      - "references isAiSuggested=1"
-      - "does not mention projectId"
-
-    models: []
+2. **Journey fixtures** — test the prompt-template builder conversation
+   (unchanged).
 """

 from __future__ import annotations
@@ -72,12 +18,14 @@ from __future__ import annotations
 import logging
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal

 import yaml

 logger = logging.getLogger(__name__)

+EvalMode = Literal["step1", "step2", "full"]
+

@dataclass
 class ExpectedRecord:
@@ -90,21 +38,52 @@ class ExpectedRecord:
    fields: dict[str, Any]  # field_name → expected_value


+@dataclass
+class ExpectedClassification:
+    """Expected output of step-1 classification for one file."""
+
+    file: str  # relative path to the sample file
+    project_id: str  # expected matched project id, or "new"
+    domains: list[str]  # expected domain list
+    new_project_name: str | None = None
+
+
@dataclass
 class EvalFixture:
-    """A complete test scenario loaded from YAML."""
+    """A complete test scenario loaded from YAML.
+
+    ``mode`` determines which pipeline steps are exercised:
+
+    - **step1**: only ``_classify_file``
+    - **step2**: only the processing LLM + tool loop
+    - **full**: both steps in sequence (``run_local_agent``)
+    """

    name: str
    description: str
+    mode: EvalMode
    directory: str  # relative path to sample files
    data_types: list[str]
    file_extensions: list[str]
-    seed_records: dict[str, list[dict]]
-    prompt_variants: dict[str, str]  # variant_name → prompt_template
-    expected: list[ExpectedRecord]
    models: list[str]  # if empty, use CLI default
    fixture_path: Path = field(default_factory=lambda: Path("."))

+    # ── Step-1 inputs (classification) ───────────────────────────
+    domain_definitions: str = ""
+    projects_list: list[dict[str, Any]] = field(default_factory=list)
+
+    # ── Step-2 inputs (processing) ───────────────────────────────
+    existing_context: str = ""
+    project_context: str = ""
+    custom_prompt_section: str = ""
+
+    # ── Seed records for mock executor ───────────────────────────
+    seed_records: dict[str, list[dict]] = field(default_factory=dict)
+
+    # ── Expected outputs ─────────────────────────────────────────
+    expected_classification: list[ExpectedClassification] = field(default_factory=list)
+    expected: list[ExpectedRecord] = field(default_factory=list)
+
    @property
    def fixture_dir(self) -> Path:
        """Absolute path to the sample files directory."""
@@ -115,22 +94,44 @@ class EvalFixture:
        """Load a fixture from a YAML file."""
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))

+        mode: EvalMode = raw.get("mode", "full")
+
+        # Parse expected records (step2/full)
        expected: list[ExpectedRecord] = []
        for table, records in (raw.get("expected") or {}).items():
            for rec in records:
                expected.append(ExpectedRecord(table=table, fields=rec))

+        # Parse expected classification (step1/full)
+        expected_classification: list[ExpectedClassification] = []
+        for item in raw.get("expected_classification") or []:
+            expected_classification.append(ExpectedClassification(
+                file=item["file"],
+                project_id=item["project_id"],
+                domains=item.get("domains", []),
+                new_project_name=item.get("new_project_name"),
+            ))
+
        return cls(
            name=raw["name"],
            description=raw.get("description", ""),
+            mode=mode,
            directory=raw.get("directory", "sample_files"),
            data_types=raw.get("data_types", ["tasks"]),
            file_extensions=raw.get("file_extensions", []),
-            seed_records=raw.get("seed_records", {}),
-            prompt_variants=raw.get("prompt_variants", {"default": ""}),
-            expected=expected,
            models=raw.get("models", []),
            fixture_path=path,
+            # Step-1 inputs
+            domain_definitions=raw.get("domain_definitions", ""),
+            projects_list=raw.get("projects_list", []),
+            # Step-2 inputs
+            existing_context=raw.get("existing_context", ""),
+            project_context=raw.get("project_context", ""),
+            custom_prompt_section=raw.get("custom_prompt_section", ""),
+            # Shared
+            seed_records=raw.get("seed_records", {}),
+            expected_classification=expected_classification,
+            expected=expected,
        )


@@ -168,9 +169,9 @@ class JourneyFixture:
    description: str
    directory: str  # relative path to sample files
    data_types: list[str]
-    user_messages: list[str]  # simulated user responses
    expected_template_criteria: list[str]  # what the template should contain/satisfy
-    models: list[str]
+    user_messages: list[str] = field(default_factory=list)  # for automated journey runs (unused in interactive mode)
+    models: list[str] = field(default_factory=list)
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property