- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete)
- eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides
- eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval)
- eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs
- eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting
- eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags
- eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
237 lines
7.8 KiB
Python
237 lines
7.8 KiB
Python
"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.
|
||
|
||
For each (fixture × model × prompt_variant) combination:
|
||
1. Build a MockExecutor with fixture data
|
||
2. Patch execute_on_client
|
||
3. Override LLM_MODEL in shared settings
|
||
4. Run the batch agent pipeline (run_local_agent)
|
||
5. Collect mutations from the mock
|
||
6. Score against expected results (field match + optional LLM judge)
|
||
7. Report scores to Langfuse
|
||
8. Print results
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import copy
|
||
import json
|
||
import logging
|
||
import time
|
||
import uuid
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
from eval.config import EvalFixture, ExpectedRecord
|
||
from eval.mock_executor import MockExecutor
|
||
from eval.scorer import (
|
||
EvalScores,
|
||
FieldScore,
|
||
compute_precision_recall,
|
||
llm_judge_score,
|
||
score_field_match,
|
||
)
|
||
from eval import langfuse_eval
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
async def run_single_eval(
    fixture: EvalFixture,
    model: str,
    prompt_variant: str,
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> EvalScores:
    """Execute one (fixture × model × prompt_variant) eval and return scores.

    The agent pipeline runs against a ``MockExecutor`` seeded from the fixture
    while ``settings.LLM_MODEL`` is temporarily set to *model*. A pipeline
    exception is logged (with traceback) and swallowed, so scoring still runs
    on whatever mutations the mock recorded before the failure.

    Args:
        fixture: Fixture providing prompt variants, seed records, expected
            records and trigger parameters.
        model: Value assigned to ``settings.LLM_MODEL`` for the duration of
            the pipeline run.
        prompt_variant: Key into ``fixture.prompt_variants``. An unknown key
            falls back to an empty prompt template (a warning is logged).
        use_llm_judge: When true and the fixture has expected records, also
            request an LLM judge score.
        judge_model: Model name forwarded to ``llm_judge_score``.

    Returns:
        The ``EvalScores`` built from field matching, plus the judge score and
        reasoning when the judge ran.
    """
    from shared.config import settings

    if prompt_variant not in fixture.prompt_variants:
        # Direct callers are not pre-filtered the way run_fixture_eval filters,
        # so make the silent empty-prompt fallback visible.
        logger.warning(
            "eval: variant %r not found in fixture %s — using an empty prompt template",
            prompt_variant, fixture.name,
        )
    prompt_template = fixture.prompt_variants.get(prompt_variant, "")

    # Build mock executor; deep-copy so one run cannot mutate the fixture's seeds.
    seed = copy.deepcopy(fixture.seed_records)
    mock = MockExecutor(
        fixture_dir=fixture.fixture_dir,
        seed_records=seed,
    )

    # Build trigger data (same shape as what redis_consumer delivers)
    trigger_data: dict[str, Any] = {
        "type": "agent_trigger",
        "directory": fixture.directory,
        "directory_paths": [fixture.directory],
        "data_types": fixture.data_types,
        "file_extensions": fixture.file_extensions,
        "prompt_template": prompt_template,
        "device_id": "eval-harness",
        "run_context": {
            "agent_id": f"eval-{fixture.name}-{prompt_variant}",
            "run_id": None,  # skip DB logging during eval
        },
    }

    eval_user_id = f"eval-{uuid.uuid4().hex[:8]}"

    logger.info(
        "eval: starting %s | model=%s | variant=%s",
        fixture.name, model, prompt_variant,
    )

    # Override the LLM model immediately before the guarded region so that no
    # statement between the override and the try/finally can leak the change.
    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model

    pipeline_ok = False
    # Monotonic clock: elapsed time must not be affected by wall-clock changes.
    start_time = time.perf_counter()
    try:
        # Patch execute_on_client + set user context, then run the pipeline
        from app.ws_context import set_current_user
        from app.agent_runner import run_local_agent

        set_current_user(eval_user_id)
        with mock.patch():
            await run_local_agent(eval_user_id, trigger_data)
        pipeline_ok = True
    except Exception:
        # Best-effort: keep going so partial mutations still get scored, but
        # record the full traceback for diagnosis.
        logger.exception(
            "eval: pipeline failed for %s/%s/%s",
            fixture.name, model, prompt_variant,
        )
    finally:
        settings.LLM_MODEL = original_model
        # Imported here as well because the import inside ``try`` may not have
        # been reached when the pipeline imports failed.
        from app.ws_context import clear_current_user

        clear_current_user()

    elapsed = time.perf_counter() - start_time
    if pipeline_ok:
        logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
    else:
        logger.info("eval: pipeline failed after %.1fs — %d mutations", elapsed, len(mock.mutations))

    # ── Score results ────────────────────────────────────────────
    all_field_scores: list[FieldScore] = []
    total_expected = 0
    total_actual = 0
    total_matched = 0
    total_extra = 0
    total_missing = 0

    # Group expected by table
    expected_by_table: dict[str, list[dict]] = {}
    for rec in fixture.expected:
        expected_by_table.setdefault(rec.table, []).append(rec.fields)

    # Compare against actual mutations (inserts + updates). Tables touched only
    # by the agent (not expected) are included so their records count as extra.
    tables = set(expected_by_table) | {m.table for m in mock.mutations}
    # Sorted so field_scores come out in the same order on every run.
    for table in sorted(tables):
        expected_records = expected_by_table.get(table, [])
        actual_records = mock.created_records(table) + mock.updated_records(table)

        field_scores, extra, missing = score_field_match(expected_records, actual_records, table)
        all_field_scores.extend(field_scores)

        matched = sum(1 for s in field_scores if s.best_match is not None)
        total_expected += len(expected_records)
        total_actual += len(actual_records)
        total_matched += matched
        total_extra += extra
        total_missing += missing

    precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)

    scores = EvalScores(
        fixture_name=fixture.name,
        model=model,
        prompt_variant=prompt_variant,
        field_scores=all_field_scores,
        precision=precision,
        recall=recall,
        f1=f1,
        extra_records=total_extra,
        missing_records=total_missing,
    )

    # ── Optional LLM judge ───────────────────────────────────────
    if use_llm_judge and fixture.expected:
        all_expected = [r.fields for r in fixture.expected]
        all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
        judge_score, reasoning = await llm_judge_score(
            all_expected, all_actual, judge_model=judge_model,
        )
        scores.llm_judge_score = judge_score
        scores.llm_judge_reasoning = reasoning

    # ── Report to Langfuse ───────────────────────────────────────
    dataset_name = f"batch-eval-{fixture.name}"
    dataset_item_id = f"{fixture.name}--{prompt_variant}"
    # Wall-clock timestamp on purpose: it makes each run name unique and datable.
    run_name = f"{model}--{prompt_variant}--{int(time.time())}"

    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant=prompt_variant,
        prompt_template=prompt_template,
        actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
        scores_summary=scores.summary(),
        dataset_name=dataset_name,
        run_name=run_name,
        dataset_item_id=dataset_item_id,
    )

    # Scores can only be attached when a trace id came back.
    if trace_id:
        langfuse_eval.post_eval_scores(scores, trace_id=trace_id)

    return scores
|
||
|
||
|
||
async def run_fixture_eval(
    fixture: EvalFixture,
    models: list[str],
    *,
    variants: list[str] | None = None,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> list[EvalScores]:
    """Evaluate a fixture across every requested (model × variant) pair.

    Args:
        fixture: Fixture to evaluate; it is synced to its Langfuse dataset
            once before any run starts.
        models: Model names, evaluated in the given order.
        variants: Prompt-variant names to run. ``None`` selects every variant
            the fixture defines. Names the fixture does not define are skipped
            with a warning (once per model).
        use_llm_judge: Forwarded to ``run_single_eval``.
        judge_model: Forwarded to ``run_single_eval``.

    Returns:
        One ``EvalScores`` per executed pair, ordered model-major.
    """
    selected = list(fixture.prompt_variants.keys()) if variants is None else variants

    # Sync fixture to Langfuse dataset
    langfuse_eval.sync_fixture_to_dataset(fixture)

    collected: list[EvalScores] = []
    for model_name in models:
        for variant_name in selected:
            if variant_name in fixture.prompt_variants:
                outcome = await run_single_eval(
                    fixture,
                    model_name,
                    variant_name,
                    use_llm_judge=use_llm_judge,
                    judge_model=judge_model,
                )
                collected.append(outcome)
            else:
                logger.warning("eval: variant %r not found in fixture %s", variant_name, fixture.name)

    return collected
|
||
|
||
|
||
def print_results(results: list[EvalScores]) -> None:
    """Print a formatted summary table of eval results.

    Columns are fixture, model, variant, precision (P), recall (R), F1,
    field accuracy (FA) and the LLM judge score (LLM, shown as ``--`` when
    absent). Any judge reasoning is printed below the table.

    Args:
        results: Scores to display; an empty list prints a short notice.
    """
    if not results:
        print("\nNo eval results.")
        return

    header = (
        f"{'Fixture':<25} {'Model':<25} {'Variant':<15} "
        f"{'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}"
    )
    # Derive the rule width from the header so the separators always span the
    # full table instead of a hard-coded width that drifts from the columns.
    rule_width = len(header)

    print("\n" + "=" * rule_width)
    print(header)
    print("-" * rule_width)

    for s in results:
        llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else "  --"
        print(
            f"{s.fixture_name:<25} {s.model:<25} {s.prompt_variant:<15} "
            f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} "
            f"{s.field_accuracy:>6.2f} {llm_str:>6}"
        )

    print("=" * rule_width)

    # If LLM judge reasoning is available, print it
    for s in results:
        if s.llm_judge_reasoning:
            print(f"\n[{s.model} / {s.prompt_variant}] LLM Judge: {s.llm_judge_reasoning}")

    print()
|