refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes
- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
This commit is contained in:
@@ -1,28 +1,31 @@
|
||||
"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.
|
||||
|
||||
For each (fixture × model × prompt_variant) combination:
|
||||
1. Build a MockExecutor with fixture data
|
||||
2. Patch execute_on_client
|
||||
3. Override LLM_MODEL in shared settings
|
||||
4. Run the batch agent pipeline (run_local_agent)
|
||||
5. Collect mutations from the mock
|
||||
6. Score against expected results (field match + optional LLM judge)
|
||||
7. Report scores to Langfuse
|
||||
8. Print results
|
||||
Supports three eval modes:
|
||||
|
||||
- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
|
||||
Calls the LLM with fixture-provided ``domain_definitions`` and
|
||||
``projects_list`` and compares output against ``expected_classification``.
|
||||
|
||||
- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
|
||||
Compiles the prompt with fixture-provided ``existing_context``,
|
||||
``project_context``, ``data_types``, and ``custom_prompt_section``,
|
||||
then runs the tool-calling loop. Mutations are scored against
|
||||
``expected`` records.
|
||||
|
||||
- **full**: Run ``run_local_agent()`` end-to-end (both steps).
|
||||
Scored on both classification and extraction.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from eval.config import EvalFixture, ExpectedRecord
|
||||
from eval.config import EvalFixture, ExpectedClassification
|
||||
from eval.mock_executor import MockExecutor
|
||||
from eval.scorer import (
|
||||
EvalScores,
|
||||
@@ -36,72 +39,193 @@ from eval import langfuse_eval
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def run_single_eval(
|
||||
# ── Step 1 runner ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def _run_step1(
|
||||
fixture: EvalFixture,
|
||||
model: str,
|
||||
prompt_variant: str,
|
||||
*,
|
||||
use_llm_judge: bool = True,
|
||||
judge_model: str = "gpt-4o-mini",
|
||||
) -> EvalScores:
|
||||
"""Execute one (fixture × model × prompt_variant) eval and return scores."""
|
||||
from shared.config import settings
|
||||
mock: MockExecutor,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Run step-1 classification for each expected file.
|
||||
|
||||
prompt_template = fixture.prompt_variants.get(prompt_variant, "")
|
||||
Returns a list of result dicts:
|
||||
``[{file, project_id, domains, new_project_name}, ...]``
|
||||
"""
|
||||
from app.agent_runner import _classify_file
|
||||
|
||||
# Build mock executor
|
||||
seed = copy.deepcopy(fixture.seed_records)
|
||||
mock = MockExecutor(
|
||||
fixture_dir=fixture.fixture_dir,
|
||||
seed_records=seed,
|
||||
results: list[dict[str, Any]] = []
|
||||
for ec in fixture.expected_classification:
|
||||
# Read the file content through the mock
|
||||
file_result = await mock._handle(
|
||||
action="read_file_content",
|
||||
data={"path": ec.file},
|
||||
)
|
||||
file_content: str = file_result.get("content", "")
|
||||
|
||||
project_id, domains, new_name = await _classify_file(
|
||||
file_path=ec.file,
|
||||
file_content=file_content,
|
||||
projects=fixture.projects_list,
|
||||
config_data_types=fixture.data_types,
|
||||
)
|
||||
results.append({
|
||||
"file": ec.file,
|
||||
"project_id": project_id,
|
||||
"domains": domains,
|
||||
"new_project_name": new_name,
|
||||
})
|
||||
return results
|
||||
|
||||
|
||||
def _score_step1(
    fixture: EvalFixture,
    results: list[dict[str, Any]],
) -> tuple[float, float, float, str]:
    """Score step-1 classification results against the fixture's expectations.

    Each entry of ``fixture.expected_classification`` is looked up in
    *results* by its ``file``.  An entry counts as matched when the
    ``project_id`` is equal and, if the expectation lists any domains, the
    actual domains form the same set (order and duplicates are ignored).
    An expectation with no domains places no constraint on them.

    Args:
        fixture: Fixture whose ``expected_classification`` entries expose
            ``file``, ``project_id`` and ``domains``.
        results: Result dicts carrying ``file``, ``project_id`` and
            ``domains`` keys.  If a file appears more than once, the first
            entry is the one scored.

    Returns:
        ``(precision, recall, f1, reasoning)``.  All three scores are the
        fraction of expected files classified correctly; ``reasoning`` is a
        newline-joined per-file report (``OK`` / ``FAIL`` / ``MISS``).
        When the fixture has no expected classifications the scores are
        ``0.0`` and ``reasoning`` says so.
    """
    expected = fixture.expected_classification
    if not expected:
        return 0.0, 0.0, 0.0, "No expected classifications"

    # Index the results once instead of rescanning the list for every
    # expected file; setdefault keeps the first occurrence of each file.
    by_file: dict[str, dict[str, Any]] = {}
    for result in results:
        by_file.setdefault(result["file"], result)

    matched = 0
    details: list[str] = []

    for ec in expected:
        actual = by_file.get(ec.file)
        if actual is None:
            details.append(f" MISS {ec.file}: not processed")
            continue

        pid_ok = actual["project_id"] == ec.project_id
        # ``domains`` may be None when nothing was classified; treat that as
        # an empty set so it is reported as a FAIL instead of raising.
        domains_ok = not ec.domains or set(actual["domains"] or ()) == set(ec.domains)

        if pid_ok and domains_ok:
            matched += 1
            details.append(
                f" OK {ec.file}: project={actual['project_id']}, domains={actual['domains']}"
            )
            continue

        parts: list[str] = []
        if not pid_ok:
            parts.append(f"project expected={ec.project_id} got={actual['project_id']}")
        if not domains_ok:
            parts.append(f"domains expected={ec.domains} got={actual['domains']}")
        details.append(f" FAIL {ec.file}: {'; '.join(parts)}")

    # Every expected file yields exactly one verdict, so precision, recall
    # and F1 share the same numerator and denominator in step 1.
    score = matched / len(expected)
    return score, score, score, "\n".join(details)
|
||||
|
||||
|
||||
# ── Step 2 runner ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def _run_step2(
|
||||
fixture: EvalFixture,
|
||||
model: str,
|
||||
mock: MockExecutor,
|
||||
) -> None:
|
||||
"""Run step-2 processing for each file in the fixture directory.
|
||||
|
||||
Compiles ``_PROCESSING_SYSTEM_PROMPT`` with fixture-provided variables
|
||||
and runs the tool-calling loop. Mutations are captured by the mock.
|
||||
"""
|
||||
from app.agent_runner import (
|
||||
_PROCESSING_SYSTEM_PROMPT,
|
||||
_build_processing_tools,
|
||||
_run_agent_with_tools,
|
||||
_MAX_PROCESSING_STEPS,
|
||||
)
|
||||
from app import tracing
|
||||
|
||||
# Compile the processing prompt with fixture variables
|
||||
system_prompt = tracing.compile_prompt(
|
||||
"batch_processing",
|
||||
fallback=_PROCESSING_SYSTEM_PROMPT,
|
||||
variables={
|
||||
"existing_context": fixture.existing_context,
|
||||
"project_context": fixture.project_context,
|
||||
"data_types": ", ".join(fixture.data_types),
|
||||
"custom_prompt_section": fixture.custom_prompt_section,
|
||||
},
|
||||
)
|
||||
|
||||
# Override the LLM model for this run
|
||||
original_model = settings.LLM_MODEL
|
||||
settings.LLM_MODEL = model
|
||||
tools = _build_processing_tools(fixture.data_types)
|
||||
|
||||
# Scan files in the fixture directory
|
||||
file_entries = await mock._handle(
|
||||
action="list_directory",
|
||||
data={"path": fixture.directory},
|
||||
)
|
||||
for entry in file_entries.get("entries", []):
|
||||
if entry.get("type") != "file":
|
||||
continue
|
||||
# Filter by extension if specified
|
||||
if fixture.file_extensions:
|
||||
ext = entry["name"].rsplit(".", 1)[-1] if "." in entry["name"] else ""
|
||||
if ext not in fixture.file_extensions:
|
||||
continue
|
||||
|
||||
file_result = await mock._handle(
|
||||
action="read_file_content",
|
||||
data={"path": entry["path"]},
|
||||
)
|
||||
file_content: str = file_result.get("content", "")
|
||||
if not file_content.strip():
|
||||
continue
|
||||
|
||||
await _run_agent_with_tools(
|
||||
system_prompt=system_prompt,
|
||||
user_message=(
|
||||
f"Process this file and extract relevant information.\n\n"
|
||||
f"File: {entry['path']}\n\nContent:\n{file_content}"
|
||||
),
|
||||
tools=tools,
|
||||
max_steps=_MAX_PROCESSING_STEPS,
|
||||
)
|
||||
|
||||
|
||||
# ── Full runner ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def _run_full(
|
||||
fixture: EvalFixture,
|
||||
model: str,
|
||||
mock: MockExecutor,
|
||||
user_id: str,
|
||||
) -> None:
|
||||
"""Run the full two-step pipeline via ``run_local_agent``."""
|
||||
from app.agent_runner import run_local_agent
|
||||
|
||||
# Build trigger data (same shape as what redis_consumer delivers)
|
||||
trigger_data: dict[str, Any] = {
|
||||
"type": "agent_trigger",
|
||||
"directory": fixture.directory,
|
||||
"directory_paths": [fixture.directory],
|
||||
"data_types": fixture.data_types,
|
||||
"file_extensions": fixture.file_extensions,
|
||||
"prompt_template": prompt_template,
|
||||
"prompt_template": fixture.custom_prompt_section,
|
||||
"device_id": "eval-harness",
|
||||
"run_context": {
|
||||
"agent_id": f"eval-{fixture.name}-{prompt_variant}",
|
||||
"run_id": None, # skip DB logging during eval
|
||||
"agent_id": f"eval-{fixture.name}",
|
||||
"run_id": None,
|
||||
},
|
||||
}
|
||||
|
||||
eval_user_id = f"eval-{uuid.uuid4().hex[:8]}"
|
||||
with mock.patch():
|
||||
await run_local_agent(user_id, trigger_data)
|
||||
|
||||
logger.info(
|
||||
"eval: starting %s | model=%s | variant=%s",
|
||||
fixture.name, model, prompt_variant,
|
||||
)
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Patch execute_on_client + set user context, then run the pipeline
|
||||
from app.ws_context import set_current_user, clear_current_user
|
||||
from app.agent_runner import run_local_agent
|
||||
# ── Scoring helpers ───────────────────────────────────────────────────────
|
||||
|
||||
set_current_user(eval_user_id)
|
||||
with mock.patch():
|
||||
await run_local_agent(eval_user_id, trigger_data)
|
||||
except Exception as exc:
|
||||
logger.error("eval: pipeline failed for %s/%s/%s: %s", fixture.name, model, prompt_variant, exc)
|
||||
finally:
|
||||
settings.LLM_MODEL = original_model
|
||||
from app.ws_context import clear_current_user
|
||||
clear_current_user()
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
|
||||
def _score_mutations(
|
||||
fixture: EvalFixture,
|
||||
mock: MockExecutor,
|
||||
) -> tuple[list[FieldScore], float, float, float, int, int]:
|
||||
"""Score mutations against expected records.
|
||||
|
||||
# ── Score results ────────────────────────────────────────────
|
||||
Returns (field_scores, precision, recall, f1, extra, missing).
|
||||
"""
|
||||
all_field_scores: list[FieldScore] = []
|
||||
total_expected = 0
|
||||
total_actual = 0
|
||||
@@ -109,12 +233,10 @@ async def run_single_eval(
|
||||
total_extra = 0
|
||||
total_missing = 0
|
||||
|
||||
# Group expected by table
|
||||
expected_by_table: dict[str, list[dict]] = {}
|
||||
for rec in fixture.expected:
|
||||
expected_by_table.setdefault(rec.table, []).append(rec.fields)
|
||||
|
||||
# Compare against actual mutations (inserts + updates)
|
||||
tables = set(expected_by_table.keys()) | {m.table for m in mock.mutations}
|
||||
for table in tables:
|
||||
expected_records = expected_by_table.get(table, [])
|
||||
@@ -131,49 +253,160 @@ async def run_single_eval(
|
||||
total_missing += missing
|
||||
|
||||
precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
|
||||
return all_field_scores, precision, recall, f1, total_extra, total_missing
|
||||
|
||||
scores = EvalScores(
|
||||
fixture_name=fixture.name,
|
||||
model=model,
|
||||
prompt_variant=prompt_variant,
|
||||
field_scores=all_field_scores,
|
||||
precision=precision,
|
||||
recall=recall,
|
||||
f1=f1,
|
||||
extra_records=total_extra,
|
||||
missing_records=total_missing,
|
||||
|
||||
# ── Main entry point ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def run_single_eval(
|
||||
fixture: EvalFixture,
|
||||
model: str,
|
||||
*,
|
||||
use_llm_judge: bool = True,
|
||||
judge_model: str = "gpt-4o-mini",
|
||||
) -> EvalScores:
|
||||
"""Execute one eval run for a fixture + model. Mode is read from the fixture."""
|
||||
from shared.config import settings
|
||||
from shared.ws_context import set_current_user, clear_current_user
|
||||
|
||||
seed = copy.deepcopy(fixture.seed_records)
|
||||
mock = MockExecutor(
|
||||
fixture_dir=fixture.fixture_path.parent,
|
||||
seed_records=seed,
|
||||
)
|
||||
|
||||
# ── Optional LLM judge ───────────────────────────────────────
|
||||
if use_llm_judge and fixture.expected:
|
||||
all_expected = [r.fields for r in fixture.expected]
|
||||
all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
|
||||
judge_score, reasoning = await llm_judge_score(
|
||||
all_expected, all_actual, judge_model=judge_model,
|
||||
)
|
||||
scores.llm_judge_score = judge_score
|
||||
scores.llm_judge_reasoning = reasoning
|
||||
original_model = settings.LLM_MODEL
|
||||
settings.LLM_MODEL = model
|
||||
eval_user_id = str(uuid.uuid4())
|
||||
|
||||
# ── Report to Langfuse ───────────────────────────────────────
|
||||
dataset_name = f"batch-eval-{fixture.name}"
|
||||
dataset_item_id = f"{fixture.name}--{prompt_variant}"
|
||||
run_name = f"{model}--{prompt_variant}--{int(time.time())}"
|
||||
logger.info(
|
||||
"eval: starting %s | mode=%s | model=%s",
|
||||
fixture.name, fixture.mode, model,
|
||||
)
|
||||
start_time = time.time()
|
||||
|
||||
step1_results: list[dict[str, Any]] = []
|
||||
step1_reasoning = ""
|
||||
|
||||
try:
|
||||
set_current_user(eval_user_id)
|
||||
|
||||
if fixture.mode == "step1":
|
||||
with mock.patch():
|
||||
step1_results = await _run_step1(fixture, model, mock)
|
||||
|
||||
elif fixture.mode == "step2":
|
||||
with mock.patch():
|
||||
await _run_step2(fixture, model, mock)
|
||||
|
||||
elif fixture.mode == "full":
|
||||
with mock.patch():
|
||||
# Step 1 — classification (independent from run_local_agent)
|
||||
if fixture.expected_classification:
|
||||
step1_results = await _run_step1(fixture, model, mock)
|
||||
|
||||
# Step 2 — full pipeline (run_local_agent handles both steps)
|
||||
await _run_full(fixture, model, mock, eval_user_id)
|
||||
|
||||
except Exception as exc:
|
||||
logger.error("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
|
||||
finally:
|
||||
settings.LLM_MODEL = original_model
|
||||
clear_current_user()
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
|
||||
|
||||
# ── Score ─────────────────────────────────────────────────────
|
||||
|
||||
if fixture.mode == "step1":
|
||||
s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
|
||||
scores = EvalScores(
|
||||
fixture_name=fixture.name,
|
||||
model=model,
|
||||
prompt_variant=fixture.mode,
|
||||
precision=s1_precision,
|
||||
recall=s1_recall,
|
||||
f1=s1_f1,
|
||||
llm_judge_reasoning=step1_reasoning,
|
||||
)
|
||||
else:
|
||||
# step2 or full — score mutations
|
||||
field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
|
||||
scores = EvalScores(
|
||||
fixture_name=fixture.name,
|
||||
model=model,
|
||||
prompt_variant=fixture.mode,
|
||||
field_scores=field_scores,
|
||||
precision=precision,
|
||||
recall=recall,
|
||||
f1=f1,
|
||||
extra_records=extra,
|
||||
missing_records=missing,
|
||||
)
|
||||
|
||||
# Add step1 classification scores for full mode
|
||||
if fixture.mode == "full" and fixture.expected_classification:
|
||||
s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
|
||||
scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"
|
||||
|
||||
# Optional LLM judge for extraction quality
|
||||
if use_llm_judge and fixture.expected:
|
||||
all_expected = [r.fields for r in fixture.expected]
|
||||
all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
|
||||
judge_score, reasoning = await llm_judge_score(
|
||||
all_expected, all_actual, judge_model=judge_model,
|
||||
)
|
||||
scores.llm_judge_score = judge_score
|
||||
if step1_reasoning:
|
||||
scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
|
||||
else:
|
||||
scores.llm_judge_reasoning = reasoning
|
||||
|
||||
# ── Report to Langfuse ────────────────────────────────────────
|
||||
prompt_names = {
|
||||
"step1": ["batch_file_classifier"],
|
||||
"step2": ["batch_processing"],
|
||||
"full": ["batch_file_classifier", "batch_processing"],
|
||||
}.get(fixture.mode, ["batch_processing"])
|
||||
|
||||
trace_id = langfuse_eval.log_eval_trace(
|
||||
fixture_name=fixture.name,
|
||||
model=model,
|
||||
prompt_variant=prompt_variant,
|
||||
prompt_template=prompt_template,
|
||||
prompt_variant=fixture.mode,
|
||||
prompt_template=fixture.custom_prompt_section or "(default)",
|
||||
actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
|
||||
scores_summary=scores.summary(),
|
||||
dataset_name=dataset_name,
|
||||
run_name=run_name,
|
||||
dataset_item_id=dataset_item_id,
|
||||
step1_results=step1_results or None,
|
||||
langfuse_prompt_names=prompt_names,
|
||||
)
|
||||
|
||||
if trace_id:
|
||||
langfuse_eval.post_eval_scores(scores, trace_id=trace_id)
|
||||
|
||||
# For full mode, post classification scores separately
|
||||
if fixture.mode == "full" and fixture.expected_classification:
|
||||
s1_p, s1_r, s1_f1, _ = _score_step1(fixture, step1_results)
|
||||
for name, value in [
|
||||
("classification_precision", s1_p),
|
||||
("classification_recall", s1_r),
|
||||
("classification_f1", s1_f1),
|
||||
]:
|
||||
try:
|
||||
from langfuse import get_client
|
||||
lf = get_client()
|
||||
if lf:
|
||||
lf.create_score(
|
||||
name=name,
|
||||
value=value,
|
||||
trace_id=trace_id,
|
||||
data_type="NUMERIC",
|
||||
comment=f"{fixture.name} | {model} | full",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
@@ -181,29 +414,20 @@ async def run_fixture_eval(
|
||||
fixture: EvalFixture,
|
||||
models: list[str],
|
||||
*,
|
||||
variants: list[str] | None = None,
|
||||
use_llm_judge: bool = True,
|
||||
judge_model: str = "gpt-4o-mini",
|
||||
) -> list[EvalScores]:
|
||||
"""Run all (model × variant) combinations for a fixture."""
|
||||
if variants is None:
|
||||
variants = list(fixture.prompt_variants.keys())
|
||||
|
||||
# Sync fixture to Langfuse dataset
|
||||
"""Run all models for a fixture."""
|
||||
langfuse_eval.sync_fixture_to_dataset(fixture)
|
||||
|
||||
results: list[EvalScores] = []
|
||||
for model in models:
|
||||
for variant in variants:
|
||||
if variant not in fixture.prompt_variants:
|
||||
logger.warning("eval: variant %r not found in fixture %s", variant, fixture.name)
|
||||
continue
|
||||
scores = await run_single_eval(
|
||||
fixture, model, variant,
|
||||
use_llm_judge=use_llm_judge,
|
||||
judge_model=judge_model,
|
||||
)
|
||||
results.append(scores)
|
||||
scores = await run_single_eval(
|
||||
fixture, model,
|
||||
use_llm_judge=use_llm_judge,
|
||||
judge_model=judge_model,
|
||||
)
|
||||
results.append(scores)
|
||||
|
||||
return results
|
||||
|
||||
@@ -214,18 +438,21 @@ def print_results(results: list[EvalScores]) -> None:
|
||||
print("\nNo eval results.")
|
||||
return
|
||||
|
||||
print("\n" + "=" * 90)
|
||||
print(f"{'Fixture':<25} {'Model':<25} {'Variant':<15} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
|
||||
print("-" * 90)
|
||||
print("\n" + "=" * 95)
|
||||
print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
|
||||
print("-" * 95)
|
||||
|
||||
for s in results:
|
||||
llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else " --"
|
||||
print(
|
||||
f"{s.fixture_name:<25} {s.model:<25} {s.prompt_variant:<15} "
|
||||
f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} "
|
||||
f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} "
|
||||
f"{s.field_accuracy:>6.2f} {llm_str:>6}"
|
||||
)
|
||||
|
||||
print("=" * 95)
|
||||
print()
|
||||
|
||||
print("=" * 90)
|
||||
|
||||
# If LLM judge reasoning is available, print it
|
||||
|
||||
Reference in New Issue
Block a user