refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
This commit is contained in:
Roberto Musso
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions

View File

@@ -1,28 +1,31 @@
"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.
For each (fixture × model × prompt_variant) combination:
1. Build a MockExecutor with fixture data
2. Patch execute_on_client
3. Override LLM_MODEL in shared settings
4. Run the batch agent pipeline (run_local_agent)
5. Collect mutations from the mock
6. Score against expected results (field match + optional LLM judge)
7. Report scores to Langfuse
8. Print results
Supports three eval modes:
- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
Calls the LLM with fixture-provided ``domain_definitions`` and
``projects_list`` and compares output against ``expected_classification``.
- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
Compiles the prompt with fixture-provided ``existing_context``,
``project_context``, ``data_types``, and ``custom_prompt_section``,
then runs the tool-calling loop. Mutations are scored against
``expected`` records.
- **full**: Run ``run_local_agent()`` end-to-end (both steps).
Scored on both classification and extraction.
"""
from __future__ import annotations
import asyncio
import copy
import json
import logging
import time
import uuid
from pathlib import Path
from typing import Any
from eval.config import EvalFixture, ExpectedRecord
from eval.config import EvalFixture, ExpectedClassification
from eval.mock_executor import MockExecutor
from eval.scorer import (
EvalScores,
@@ -36,72 +39,193 @@ from eval import langfuse_eval
logger = logging.getLogger(__name__)
async def run_single_eval(
# ── Step 1 runner ─────────────────────────────────────────────────────────
async def _run_step1(
fixture: EvalFixture,
model: str,
prompt_variant: str,
*,
use_llm_judge: bool = True,
judge_model: str = "gpt-4o-mini",
) -> EvalScores:
"""Execute one (fixture × model × prompt_variant) eval and return scores."""
from shared.config import settings
mock: MockExecutor,
) -> list[dict[str, Any]]:
"""Run step-1 classification for each expected file.
prompt_template = fixture.prompt_variants.get(prompt_variant, "")
Returns a list of result dicts:
``[{file, project_id, domains, new_project_name}, ...]``
"""
from app.agent_runner import _classify_file
# Build mock executor
seed = copy.deepcopy(fixture.seed_records)
mock = MockExecutor(
fixture_dir=fixture.fixture_dir,
seed_records=seed,
results: list[dict[str, Any]] = []
for ec in fixture.expected_classification:
# Read the file content through the mock
file_result = await mock._handle(
action="read_file_content",
data={"path": ec.file},
)
file_content: str = file_result.get("content", "")
project_id, domains, new_name = await _classify_file(
file_path=ec.file,
file_content=file_content,
projects=fixture.projects_list,
config_data_types=fixture.data_types,
)
results.append({
"file": ec.file,
"project_id": project_id,
"domains": domains,
"new_project_name": new_name,
})
return results
def _score_step1(
    fixture: EvalFixture,
    results: list[dict[str, Any]],
) -> tuple[float, float, float, str]:
    """Score step-1 classification results against the fixture's expectations.

    An expected file counts as matched when the actual ``project_id`` equals
    the expected one and, if the expectation lists any domains, the actual
    domains form the same set (order and duplicates are ignored). An
    expectation without domains skips the domain comparison.

    Args:
        fixture: Fixture whose ``expected_classification`` entries expose
            ``file``, ``project_id`` and ``domains``.
        results: Result dicts carrying ``file``, ``project_id`` and
            ``domains`` keys. If several entries share a ``file``, the first
            one is the one that gets scored.

    Returns:
        ``(precision, recall, f1, reasoning)``. All three metrics are the
        fraction of expected files that matched; ``reasoning`` contains one
        OK / FAIL / MISS line per expected file, joined with newlines.
    """
    expected = fixture.expected_classification
    if not expected:
        return 0.0, 0.0, 0.0, "No expected classifications"

    # Index the results once instead of rescanning the list for every
    # expected file; ``setdefault`` preserves first-match semantics.
    by_file: dict[str, dict[str, Any]] = {}
    for result in results:
        by_file.setdefault(result["file"], result)

    matched = 0
    details: list[str] = []
    for ec in expected:
        actual = by_file.get(ec.file)
        if actual is None:
            details.append(f" MISS {ec.file}: not processed")
            continue

        pid_ok = actual["project_id"] == ec.project_id
        # ``domains`` may be ``None`` in a result; treat that as "no domains"
        # so scoring reports a FAIL instead of raising TypeError in ``set()``.
        domains_ok = (
            not ec.domains
            or set(actual["domains"] or ()) == set(ec.domains)
        )

        if pid_ok and domains_ok:
            matched += 1
            details.append(
                f" OK {ec.file}: project={actual['project_id']}, domains={actual['domains']}"
            )
            continue

        parts: list[str] = []
        if not pid_ok:
            parts.append(f"project expected={ec.project_id} got={actual['project_id']}")
        if not domains_ok:
            parts.append(f"domains expected={ec.domains} got={actual['domains']}")
        details.append(f" FAIL {ec.file}: {'; '.join(parts)}")

    # Every expected file is scored exactly once, so precision, recall and F1
    # all reduce to the same matched / expected ratio. ``expected`` is
    # non-empty here, so the division is safe.
    score = matched / len(expected)
    return score, score, score, "\n".join(details)
# ── Step 2 runner ─────────────────────────────────────────────────────────
async def _run_step2(
fixture: EvalFixture,
model: str,
mock: MockExecutor,
) -> None:
"""Run step-2 processing for each file in the fixture directory.
Compiles ``_PROCESSING_SYSTEM_PROMPT`` with fixture-provided variables
and runs the tool-calling loop. Mutations are captured by the mock.
"""
from app.agent_runner import (
_PROCESSING_SYSTEM_PROMPT,
_build_processing_tools,
_run_agent_with_tools,
_MAX_PROCESSING_STEPS,
)
from app import tracing
# Compile the processing prompt with fixture variables
system_prompt = tracing.compile_prompt(
"batch_processing",
fallback=_PROCESSING_SYSTEM_PROMPT,
variables={
"existing_context": fixture.existing_context,
"project_context": fixture.project_context,
"data_types": ", ".join(fixture.data_types),
"custom_prompt_section": fixture.custom_prompt_section,
},
)
# Override the LLM model for this run
original_model = settings.LLM_MODEL
settings.LLM_MODEL = model
tools = _build_processing_tools(fixture.data_types)
# Scan files in the fixture directory
file_entries = await mock._handle(
action="list_directory",
data={"path": fixture.directory},
)
for entry in file_entries.get("entries", []):
if entry.get("type") != "file":
continue
# Filter by extension if specified
if fixture.file_extensions:
ext = entry["name"].rsplit(".", 1)[-1] if "." in entry["name"] else ""
if ext not in fixture.file_extensions:
continue
file_result = await mock._handle(
action="read_file_content",
data={"path": entry["path"]},
)
file_content: str = file_result.get("content", "")
if not file_content.strip():
continue
await _run_agent_with_tools(
system_prompt=system_prompt,
user_message=(
f"Process this file and extract relevant information.\n\n"
f"File: {entry['path']}\n\nContent:\n{file_content}"
),
tools=tools,
max_steps=_MAX_PROCESSING_STEPS,
)
# ── Full runner ───────────────────────────────────────────────────────────
async def _run_full(
fixture: EvalFixture,
model: str,
mock: MockExecutor,
user_id: str,
) -> None:
"""Run the full two-step pipeline via ``run_local_agent``."""
from app.agent_runner import run_local_agent
# Build trigger data (same shape as what redis_consumer delivers)
trigger_data: dict[str, Any] = {
"type": "agent_trigger",
"directory": fixture.directory,
"directory_paths": [fixture.directory],
"data_types": fixture.data_types,
"file_extensions": fixture.file_extensions,
"prompt_template": prompt_template,
"prompt_template": fixture.custom_prompt_section,
"device_id": "eval-harness",
"run_context": {
"agent_id": f"eval-{fixture.name}-{prompt_variant}",
"run_id": None, # skip DB logging during eval
"agent_id": f"eval-{fixture.name}",
"run_id": None,
},
}
eval_user_id = f"eval-{uuid.uuid4().hex[:8]}"
with mock.patch():
await run_local_agent(user_id, trigger_data)
logger.info(
"eval: starting %s | model=%s | variant=%s",
fixture.name, model, prompt_variant,
)
start_time = time.time()
try:
# Patch execute_on_client + set user context, then run the pipeline
from app.ws_context import set_current_user, clear_current_user
from app.agent_runner import run_local_agent
# ── Scoring helpers ───────────────────────────────────────────────────────
set_current_user(eval_user_id)
with mock.patch():
await run_local_agent(eval_user_id, trigger_data)
except Exception as exc:
logger.error("eval: pipeline failed for %s/%s/%s: %s", fixture.name, model, prompt_variant, exc)
finally:
settings.LLM_MODEL = original_model
from app.ws_context import clear_current_user
clear_current_user()
elapsed = time.time() - start_time
logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
def _score_mutations(
fixture: EvalFixture,
mock: MockExecutor,
) -> tuple[list[FieldScore], float, float, float, int, int]:
"""Score mutations against expected records.
# ── Score results ────────────────────────────────────────────
Returns (field_scores, precision, recall, f1, extra, missing).
"""
all_field_scores: list[FieldScore] = []
total_expected = 0
total_actual = 0
@@ -109,12 +233,10 @@ async def run_single_eval(
total_extra = 0
total_missing = 0
# Group expected by table
expected_by_table: dict[str, list[dict]] = {}
for rec in fixture.expected:
expected_by_table.setdefault(rec.table, []).append(rec.fields)
# Compare against actual mutations (inserts + updates)
tables = set(expected_by_table.keys()) | {m.table for m in mock.mutations}
for table in tables:
expected_records = expected_by_table.get(table, [])
@@ -131,49 +253,160 @@ async def run_single_eval(
total_missing += missing
precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
return all_field_scores, precision, recall, f1, total_extra, total_missing
scores = EvalScores(
fixture_name=fixture.name,
model=model,
prompt_variant=prompt_variant,
field_scores=all_field_scores,
precision=precision,
recall=recall,
f1=f1,
extra_records=total_extra,
missing_records=total_missing,
# ── Main entry point ──────────────────────────────────────────────────────
async def run_single_eval(
fixture: EvalFixture,
model: str,
*,
use_llm_judge: bool = True,
judge_model: str = "gpt-4o-mini",
) -> EvalScores:
"""Execute one eval run for a fixture + model. Mode is read from the fixture."""
from shared.config import settings
from shared.ws_context import set_current_user, clear_current_user
seed = copy.deepcopy(fixture.seed_records)
mock = MockExecutor(
fixture_dir=fixture.fixture_path.parent,
seed_records=seed,
)
# ── Optional LLM judge ───────────────────────────────────────
if use_llm_judge and fixture.expected:
all_expected = [r.fields for r in fixture.expected]
all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
judge_score, reasoning = await llm_judge_score(
all_expected, all_actual, judge_model=judge_model,
)
scores.llm_judge_score = judge_score
scores.llm_judge_reasoning = reasoning
original_model = settings.LLM_MODEL
settings.LLM_MODEL = model
eval_user_id = str(uuid.uuid4())
# ── Report to Langfuse ───────────────────────────────────────
dataset_name = f"batch-eval-{fixture.name}"
dataset_item_id = f"{fixture.name}--{prompt_variant}"
run_name = f"{model}--{prompt_variant}--{int(time.time())}"
logger.info(
"eval: starting %s | mode=%s | model=%s",
fixture.name, fixture.mode, model,
)
start_time = time.time()
step1_results: list[dict[str, Any]] = []
step1_reasoning = ""
try:
set_current_user(eval_user_id)
if fixture.mode == "step1":
with mock.patch():
step1_results = await _run_step1(fixture, model, mock)
elif fixture.mode == "step2":
with mock.patch():
await _run_step2(fixture, model, mock)
elif fixture.mode == "full":
with mock.patch():
# Step 1 — classification (independent from run_local_agent)
if fixture.expected_classification:
step1_results = await _run_step1(fixture, model, mock)
# Step 2 — full pipeline (run_local_agent handles both steps)
await _run_full(fixture, model, mock, eval_user_id)
except Exception as exc:
logger.error("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
finally:
settings.LLM_MODEL = original_model
clear_current_user()
elapsed = time.time() - start_time
logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
# ── Score ─────────────────────────────────────────────────────
if fixture.mode == "step1":
s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
scores = EvalScores(
fixture_name=fixture.name,
model=model,
prompt_variant=fixture.mode,
precision=s1_precision,
recall=s1_recall,
f1=s1_f1,
llm_judge_reasoning=step1_reasoning,
)
else:
# step2 or full — score mutations
field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
scores = EvalScores(
fixture_name=fixture.name,
model=model,
prompt_variant=fixture.mode,
field_scores=field_scores,
precision=precision,
recall=recall,
f1=f1,
extra_records=extra,
missing_records=missing,
)
# Add step1 classification scores for full mode
if fixture.mode == "full" and fixture.expected_classification:
s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"
# Optional LLM judge for extraction quality
if use_llm_judge and fixture.expected:
all_expected = [r.fields for r in fixture.expected]
all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
judge_score, reasoning = await llm_judge_score(
all_expected, all_actual, judge_model=judge_model,
)
scores.llm_judge_score = judge_score
if step1_reasoning:
scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
else:
scores.llm_judge_reasoning = reasoning
# ── Report to Langfuse ────────────────────────────────────────
prompt_names = {
"step1": ["batch_file_classifier"],
"step2": ["batch_processing"],
"full": ["batch_file_classifier", "batch_processing"],
}.get(fixture.mode, ["batch_processing"])
trace_id = langfuse_eval.log_eval_trace(
fixture_name=fixture.name,
model=model,
prompt_variant=prompt_variant,
prompt_template=prompt_template,
prompt_variant=fixture.mode,
prompt_template=fixture.custom_prompt_section or "(default)",
actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
scores_summary=scores.summary(),
dataset_name=dataset_name,
run_name=run_name,
dataset_item_id=dataset_item_id,
step1_results=step1_results or None,
langfuse_prompt_names=prompt_names,
)
if trace_id:
langfuse_eval.post_eval_scores(scores, trace_id=trace_id)
# For full mode, post classification scores separately
if fixture.mode == "full" and fixture.expected_classification:
s1_p, s1_r, s1_f1, _ = _score_step1(fixture, step1_results)
for name, value in [
("classification_precision", s1_p),
("classification_recall", s1_r),
("classification_f1", s1_f1),
]:
try:
from langfuse import get_client
lf = get_client()
if lf:
lf.create_score(
name=name,
value=value,
trace_id=trace_id,
data_type="NUMERIC",
comment=f"{fixture.name} | {model} | full",
)
except Exception:
pass
return scores
@@ -181,29 +414,20 @@ async def run_fixture_eval(
fixture: EvalFixture,
models: list[str],
*,
variants: list[str] | None = None,
use_llm_judge: bool = True,
judge_model: str = "gpt-4o-mini",
) -> list[EvalScores]:
"""Run all (model × variant) combinations for a fixture."""
if variants is None:
variants = list(fixture.prompt_variants.keys())
# Sync fixture to Langfuse dataset
"""Run all models for a fixture."""
langfuse_eval.sync_fixture_to_dataset(fixture)
results: list[EvalScores] = []
for model in models:
for variant in variants:
if variant not in fixture.prompt_variants:
logger.warning("eval: variant %r not found in fixture %s", variant, fixture.name)
continue
scores = await run_single_eval(
fixture, model, variant,
use_llm_judge=use_llm_judge,
judge_model=judge_model,
)
results.append(scores)
scores = await run_single_eval(
fixture, model,
use_llm_judge=use_llm_judge,
judge_model=judge_model,
)
results.append(scores)
return results
@@ -214,18 +438,21 @@ def print_results(results: list[EvalScores]) -> None:
print("\nNo eval results.")
return
print("\n" + "=" * 90)
print(f"{'Fixture':<25} {'Model':<25} {'Variant':<15} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
print("-" * 90)
print("\n" + "=" * 95)
print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
print("-" * 95)
for s in results:
llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else " --"
print(
f"{s.fixture_name:<25} {s.model:<25} {s.prompt_variant:<15} "
f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} "
f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} "
f"{s.field_accuracy:>6.2f} {llm_str:>6}"
)
print("=" * 95)
print()
print("=" * 90)
# If LLM judge reasoning is available, print it