refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes
- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
This commit is contained in:
@@ -1,70 +1,16 @@
|
||||
"""Eval configuration — YAML fixture loader and dataclasses.
|
||||
|
||||
A *fixture* is a YAML file that defines a complete test scenario:
|
||||
Fixtures come in two families:
|
||||
|
||||
.. code-block:: yaml
|
||||
1. **Agent fixtures** — test the batch agent pipeline.
|
||||
Three modes controlled by ``mode``:
|
||||
|
||||
name: freelance-invoices
|
||||
description: Extract tasks and notes from invoice PDFs (text layer)
|
||||
directory: sample_files/invoices # relative to fixture dir
|
||||
data_types: [tasks, notes]
|
||||
file_extensions: [txt, md]
|
||||
``step1`` — classification prompt only.
|
||||
``step2`` — processing prompt only.
|
||||
``full`` — both steps in sequence.
|
||||
|
||||
# Preseeded records the agent "sees" as existing data
|
||||
seed_records:
|
||||
projects:
|
||||
- id: proj-1
|
||||
name: "Website Redesign"
|
||||
status: active
|
||||
tasks: []
|
||||
|
||||
# Prompt variations to test (at least one required)
|
||||
prompt_variants:
|
||||
baseline: |
|
||||
Extract action items as tasks and meeting summaries as notes.
|
||||
Set priority based on urgency keywords.
|
||||
detailed: |
|
||||
Extract action items as tasks. Map "URGENT" to high priority,
|
||||
"ASAP" to medium. Summaries become notes with full content.
|
||||
|
||||
# Expected extractions — what the agent SHOULD produce
|
||||
expected:
|
||||
tasks:
|
||||
- title: "Send revised invoice to client"
|
||||
priority: high
|
||||
status: todo
|
||||
- title: "Update project timeline"
|
||||
priority: medium
|
||||
notes:
|
||||
- title: "Meeting summary - March kickoff"
|
||||
|
||||
# Optional: models to test (overrides CLI --models)
|
||||
models: []
|
||||
|
||||
A *journey fixture* tests the prompt-template builder conversation:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
type: journey
|
||||
name: journey-invoices
|
||||
description: Test journey builds a good template for invoices
|
||||
directory: sample_files/invoices
|
||||
data_types: [tasks, notes]
|
||||
|
||||
# Simulated user responses for multi-turn conversation
|
||||
user_messages:
|
||||
- "I want to extract action items and meeting summaries"
|
||||
- "Yes, map URGENTE to high priority"
|
||||
- "That looks good, generate the template"
|
||||
|
||||
# Criteria the generated prompt_template should satisfy
|
||||
expected_template_criteria:
|
||||
- "mentions tasks and notes as target entities"
|
||||
- "includes priority mapping rules"
|
||||
- "references isAiSuggested=1"
|
||||
- "does not mention projectId"
|
||||
|
||||
models: []
|
||||
2. **Journey fixtures** — test the prompt-template builder conversation
|
||||
(unchanged).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -72,12 +18,14 @@ from __future__ import annotations
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Any, Literal
|
||||
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
EvalMode = Literal["step1", "step2", "full"]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExpectedRecord:
|
||||
@@ -90,21 +38,52 @@ class ExpectedRecord:
|
||||
fields: dict[str, Any] # field_name → expected_value
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExpectedClassification:
|
||||
"""Expected output of step-1 classification for one file."""
|
||||
|
||||
file: str # relative path to the sample file
|
||||
project_id: str # expected matched project id, or "new"
|
||||
domains: list[str] # expected domain list
|
||||
new_project_name: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvalFixture:
|
||||
"""A complete test scenario loaded from YAML."""
|
||||
"""A complete test scenario loaded from YAML.
|
||||
|
||||
``mode`` determines which pipeline steps are exercised:
|
||||
|
||||
- **step1**: only ``_classify_file``
|
||||
- **step2**: only the processing LLM + tool loop
|
||||
- **full**: both steps in sequence (``run_local_agent``)
|
||||
"""
|
||||
|
||||
name: str
|
||||
description: str
|
||||
mode: EvalMode
|
||||
directory: str # relative path to sample files
|
||||
data_types: list[str]
|
||||
file_extensions: list[str]
|
||||
seed_records: dict[str, list[dict]]
|
||||
prompt_variants: dict[str, str] # variant_name → prompt_template
|
||||
expected: list[ExpectedRecord]
|
||||
models: list[str] # if empty, use CLI default
|
||||
fixture_path: Path = field(default_factory=lambda: Path("."))
|
||||
|
||||
# ── Step-1 inputs (classification) ───────────────────────────
|
||||
domain_definitions: str = ""
|
||||
projects_list: list[dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
# ── Step-2 inputs (processing) ───────────────────────────────
|
||||
existing_context: str = ""
|
||||
project_context: str = ""
|
||||
custom_prompt_section: str = ""
|
||||
|
||||
# ── Seed records for mock executor ───────────────────────────
|
||||
seed_records: dict[str, list[dict]] = field(default_factory=dict)
|
||||
|
||||
# ── Expected outputs ─────────────────────────────────────────
|
||||
expected_classification: list[ExpectedClassification] = field(default_factory=list)
|
||||
expected: list[ExpectedRecord] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def fixture_dir(self) -> Path:
|
||||
"""Absolute path to the sample files directory."""
|
||||
@@ -115,22 +94,44 @@ class EvalFixture:
|
||||
"""Load a fixture from a YAML file."""
|
||||
raw = yaml.safe_load(path.read_text(encoding="utf-8"))
|
||||
|
||||
mode: EvalMode = raw.get("mode", "full")
|
||||
|
||||
# Parse expected records (step2/full)
|
||||
expected: list[ExpectedRecord] = []
|
||||
for table, records in (raw.get("expected") or {}).items():
|
||||
for rec in records:
|
||||
expected.append(ExpectedRecord(table=table, fields=rec))
|
||||
|
||||
# Parse expected classification (step1/full)
|
||||
expected_classification: list[ExpectedClassification] = []
|
||||
for item in raw.get("expected_classification") or []:
|
||||
expected_classification.append(ExpectedClassification(
|
||||
file=item["file"],
|
||||
project_id=item["project_id"],
|
||||
domains=item.get("domains", []),
|
||||
new_project_name=item.get("new_project_name"),
|
||||
))
|
||||
|
||||
return cls(
|
||||
name=raw["name"],
|
||||
description=raw.get("description", ""),
|
||||
mode=mode,
|
||||
directory=raw.get("directory", "sample_files"),
|
||||
data_types=raw.get("data_types", ["tasks"]),
|
||||
file_extensions=raw.get("file_extensions", []),
|
||||
seed_records=raw.get("seed_records", {}),
|
||||
prompt_variants=raw.get("prompt_variants", {"default": ""}),
|
||||
expected=expected,
|
||||
models=raw.get("models", []),
|
||||
fixture_path=path,
|
||||
# Step-1 inputs
|
||||
domain_definitions=raw.get("domain_definitions", ""),
|
||||
projects_list=raw.get("projects_list", []),
|
||||
# Step-2 inputs
|
||||
existing_context=raw.get("existing_context", ""),
|
||||
project_context=raw.get("project_context", ""),
|
||||
custom_prompt_section=raw.get("custom_prompt_section", ""),
|
||||
# Shared
|
||||
seed_records=raw.get("seed_records", {}),
|
||||
expected_classification=expected_classification,
|
||||
expected=expected,
|
||||
)
|
||||
|
||||
|
||||
@@ -168,9 +169,9 @@ class JourneyFixture:
|
||||
description: str
|
||||
directory: str # relative path to sample files
|
||||
data_types: list[str]
|
||||
user_messages: list[str] # simulated user responses
|
||||
expected_template_criteria: list[str] # what the template should contain/satisfy
|
||||
models: list[str]
|
||||
user_messages: list[str] = field(default_factory=list) # for automated journey runs (unused in interactive mode)
|
||||
models: list[str] = field(default_factory=list)
|
||||
fixture_path: Path = field(default_factory=lambda: Path("."))
|
||||
|
||||
@property
|
||||
|
||||
Reference in New Issue
Block a user