refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
This commit is contained in:
Roberto Musso
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions

View File

@@ -1,70 +1,16 @@
"""Eval configuration — YAML fixture loader and dataclasses.
A *fixture* is a YAML file that defines a complete test scenario:
Fixtures come in two families:
.. code-block:: yaml
1. **Agent fixtures** — test the batch agent pipeline.
Three modes controlled by ``mode``:
name: freelance-invoices
description: Extract tasks and notes from invoice PDFs (text layer)
directory: sample_files/invoices # relative to fixture dir
data_types: [tasks, notes]
file_extensions: [txt, md]
``step1`` — classification prompt only.
``step2`` — processing prompt only.
``full`` — both steps in sequence.
# Preseeded records the agent "sees" as existing data
seed_records:
projects:
- id: proj-1
name: "Website Redesign"
status: active
tasks: []
# Prompt variations to test (at least one required)
prompt_variants:
baseline: |
Extract action items as tasks and meeting summaries as notes.
Set priority based on urgency keywords.
detailed: |
Extract action items as tasks. Map "URGENT" to high priority,
"ASAP" to medium. Summaries become notes with full content.
# Expected extractions — what the agent SHOULD produce
expected:
tasks:
- title: "Send revised invoice to client"
priority: high
status: todo
- title: "Update project timeline"
priority: medium
notes:
- title: "Meeting summary - March kickoff"
# Optional: models to test (overrides CLI --models)
models: []
A *journey fixture* tests the prompt-template builder conversation:
.. code-block:: yaml
type: journey
name: journey-invoices
description: Test journey builds a good template for invoices
directory: sample_files/invoices
data_types: [tasks, notes]
# Simulated user responses for multi-turn conversation
user_messages:
- "I want to extract action items and meeting summaries"
- "Yes, map URGENTE to high priority"
- "That looks good, generate the template"
# Criteria the generated prompt_template should satisfy
expected_template_criteria:
- "mentions tasks and notes as target entities"
- "includes priority mapping rules"
- "references isAiSuggested=1"
- "does not mention projectId"
models: []
2. **Journey fixtures** — test the prompt-template builder conversation
(unchanged).
"""
from __future__ import annotations
@@ -72,12 +18,14 @@ from __future__ import annotations
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from typing import Any, Literal
import yaml
logger = logging.getLogger(__name__)
EvalMode = Literal["step1", "step2", "full"]
@dataclass
class ExpectedRecord:
@@ -90,21 +38,52 @@ class ExpectedRecord:
fields: dict[str, Any] # field_name → expected_value
@dataclass
class ExpectedClassification:
"""Expected output of step-1 classification for one file.

One instance is built for each entry of a fixture's
``expected_classification`` list. The loader reads ``file`` and
``project_id`` with plain key access, so both must be present in the
YAML entry; ``domains`` and ``new_project_name`` are optional there.
"""
file: str # relative path to the sample file
project_id: str # expected matched project id, or "new"
# The fixture loader supplies [] when the YAML entry omits ``domains``.
domains: list[str] # expected domain list
# NOTE(review): presumably only meaningful when project_id == "new" — confirm.
new_project_name: str | None = None
@dataclass
class EvalFixture:
"""A complete test scenario loaded from YAML."""
"""A complete test scenario loaded from YAML.
``mode`` determines which pipeline steps are exercised:
- **step1**: only ``_classify_file``
- **step2**: only the processing LLM + tool loop
- **full**: both steps in sequence (``run_local_agent``)
"""
name: str
description: str
mode: EvalMode
directory: str # relative path to sample files
data_types: list[str]
file_extensions: list[str]
seed_records: dict[str, list[dict]]
prompt_variants: dict[str, str] # variant_name → prompt_template
expected: list[ExpectedRecord]
models: list[str] # if empty, use CLI default
fixture_path: Path = field(default_factory=lambda: Path("."))
# ── Step-1 inputs (classification) ───────────────────────────
domain_definitions: str = ""
projects_list: list[dict[str, Any]] = field(default_factory=list)
# ── Step-2 inputs (processing) ───────────────────────────────
existing_context: str = ""
project_context: str = ""
custom_prompt_section: str = ""
# ── Seed records for mock executor ───────────────────────────
seed_records: dict[str, list[dict]] = field(default_factory=dict)
# ── Expected outputs ─────────────────────────────────────────
expected_classification: list[ExpectedClassification] = field(default_factory=list)
expected: list[ExpectedRecord] = field(default_factory=list)
@property
def fixture_dir(self) -> Path:
"""Absolute path to the sample files directory."""
@@ -115,22 +94,44 @@ class EvalFixture:
"""Load a fixture from a YAML file."""
raw = yaml.safe_load(path.read_text(encoding="utf-8"))
mode: EvalMode = raw.get("mode", "full")
# Parse expected records (step2/full)
expected: list[ExpectedRecord] = []
for table, records in (raw.get("expected") or {}).items():
for rec in records:
expected.append(ExpectedRecord(table=table, fields=rec))
# Parse expected classification (step1/full)
expected_classification: list[ExpectedClassification] = []
for item in raw.get("expected_classification") or []:
expected_classification.append(ExpectedClassification(
file=item["file"],
project_id=item["project_id"],
domains=item.get("domains", []),
new_project_name=item.get("new_project_name"),
))
return cls(
name=raw["name"],
description=raw.get("description", ""),
mode=mode,
directory=raw.get("directory", "sample_files"),
data_types=raw.get("data_types", ["tasks"]),
file_extensions=raw.get("file_extensions", []),
seed_records=raw.get("seed_records", {}),
prompt_variants=raw.get("prompt_variants", {"default": ""}),
expected=expected,
models=raw.get("models", []),
fixture_path=path,
# Step-1 inputs
domain_definitions=raw.get("domain_definitions", ""),
projects_list=raw.get("projects_list", []),
# Step-2 inputs
existing_context=raw.get("existing_context", ""),
project_context=raw.get("project_context", ""),
custom_prompt_section=raw.get("custom_prompt_section", ""),
# Shared
seed_records=raw.get("seed_records", {}),
expected_classification=expected_classification,
expected=expected,
)
@@ -168,9 +169,9 @@ class JourneyFixture:
description: str
directory: str # relative path to sample files
data_types: list[str]
user_messages: list[str] # simulated user responses
expected_template_criteria: list[str] # what the template should contain/satisfy
models: list[str]
user_messages: list[str] = field(default_factory=list) # for automated journey runs (unused in interactive mode)
models: list[str] = field(default_factory=list)
fixture_path: Path = field(default_factory=lambda: Path("."))
@property