feat(batch-agent): add E2E evaluation harness with Langfuse integration
- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete)
- eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides
- eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval)
- eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs
- eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting
- eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags
- eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
This commit is contained in:
236
services/batch-agent/eval/runner.py
Normal file
236
services/batch-agent/eval/runner.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.
|
||||
|
||||
For each (fixture × model × prompt_variant) combination:
|
||||
1. Build a MockExecutor with fixture data
|
||||
2. Patch execute_on_client
|
||||
3. Override LLM_MODEL in shared settings
|
||||
4. Run the batch agent pipeline (run_local_agent)
|
||||
5. Collect mutations from the mock
|
||||
6. Score against expected results (field match + optional LLM judge)
|
||||
7. Report scores to Langfuse
|
||||
8. Print results
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from eval.config import EvalFixture, ExpectedRecord
|
||||
from eval.mock_executor import MockExecutor
|
||||
from eval.scorer import (
|
||||
EvalScores,
|
||||
FieldScore,
|
||||
compute_precision_recall,
|
||||
llm_judge_score,
|
||||
score_field_match,
|
||||
)
|
||||
from eval import langfuse_eval
|
||||
|
||||
# Module-level logger named after this module; used by all eval-runner functions.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _build_trigger_data(
    fixture: EvalFixture,
    prompt_variant: str,
    prompt_template: str,
) -> dict[str, Any]:
    """Build the agent trigger payload for one fixture / prompt variant.

    The payload has the same shape as what redis_consumer delivers.
    """
    return {
        "type": "agent_trigger",
        "directory": fixture.directory,
        "directory_paths": [fixture.directory],
        "data_types": fixture.data_types,
        "file_extensions": fixture.file_extensions,
        "prompt_template": prompt_template,
        "device_id": "eval-harness",
        "run_context": {
            "agent_id": f"eval-{fixture.name}-{prompt_variant}",
            "run_id": None,  # skip DB logging during eval
        },
    }


def _score_mutations(
    fixture: EvalFixture,
    mock: MockExecutor,
    *,
    model: str,
    prompt_variant: str,
) -> EvalScores:
    """Score the mutations recorded by *mock* against the fixture's expectations.

    Records are compared table by table (inserts + updates) and the per-table
    counts are aggregated into overall precision / recall / F1.
    """
    # Group expected records by table.
    expected_by_table: dict[str, list[dict]] = {}
    for rec in fixture.expected:
        expected_by_table.setdefault(rec.table, []).append(rec.fields)

    all_field_scores: list[FieldScore] = []
    total_expected = 0
    total_actual = 0
    total_matched = 0
    total_extra = 0
    total_missing = 0

    # Sort the table names so the order of field scores is stable across runs
    # (plain set iteration order depends on string hash randomisation).
    tables = sorted(
        set(expected_by_table) | {m.table for m in mock.mutations},
        key=str,
    )
    for table in tables:
        expected_records = expected_by_table.get(table, [])
        actual_records = mock.created_records(table) + mock.updated_records(table)

        field_scores, extra, missing = score_field_match(expected_records, actual_records, table)
        all_field_scores.extend(field_scores)

        total_expected += len(expected_records)
        total_actual += len(actual_records)
        total_matched += sum(1 for s in field_scores if s.best_match is not None)
        total_extra += extra
        total_missing += missing

    precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)

    return EvalScores(
        fixture_name=fixture.name,
        model=model,
        prompt_variant=prompt_variant,
        field_scores=all_field_scores,
        precision=precision,
        recall=recall,
        f1=f1,
        extra_records=total_extra,
        missing_records=total_missing,
    )


async def run_single_eval(
    fixture: EvalFixture,
    model: str,
    prompt_variant: str,
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> EvalScores:
    """Execute one (fixture × model × prompt_variant) eval and return scores.

    Steps: build a MockExecutor from the fixture, temporarily override
    ``settings.LLM_MODEL``, run the agent pipeline with ``execute_on_client``
    patched, score the recorded mutations, optionally ask the LLM judge, and
    report the outcome to Langfuse.

    A pipeline exception is logged (with traceback) and does not abort the
    eval: whatever mutations were recorded before the failure are still scored.

    Args:
        fixture: The eval fixture (inputs, seed records, expected records).
        model: Model name written to ``settings.LLM_MODEL`` for this run.
        prompt_variant: Key into ``fixture.prompt_variants``; an unknown key
            falls back to an empty prompt template (a warning is logged).
        use_llm_judge: When true and the fixture has expected records, also
            run the LLM judge.
        judge_model: Model name passed to the LLM judge.

    Returns:
        The EvalScores for this combination.
    """
    from shared.config import settings

    if prompt_variant not in fixture.prompt_variants:
        logger.warning(
            "eval: variant %r not found in fixture %s; using empty prompt template",
            prompt_variant, fixture.name,
        )
    prompt_template = fixture.prompt_variants.get(prompt_variant, "")

    # Build mock executor; deep-copy the seed so runs cannot affect each other.
    mock = MockExecutor(
        fixture_dir=fixture.fixture_dir,
        seed_records=copy.deepcopy(fixture.seed_records),
    )

    trigger_data = _build_trigger_data(fixture, prompt_variant, prompt_template)
    eval_user_id = f"eval-{uuid.uuid4().hex[:8]}"

    logger.info(
        "eval: starting %s | model=%s | variant=%s",
        fixture.name, model, prompt_variant,
    )

    pipeline_ok = False
    # Monotonic clock: the duration is unaffected by wall-clock adjustments.
    started = time.perf_counter()

    # Override the LLM model immediately before the try/finally that restores
    # it, so nothing can raise in between and leak the override.
    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    try:
        # Patch execute_on_client + set user context, then run the pipeline.
        # The app imports stay lazy and inside the try, as in the original flow.
        from app.ws_context import set_current_user
        from app.agent_runner import run_local_agent

        set_current_user(eval_user_id)
        with mock.patch():
            await run_local_agent(eval_user_id, trigger_data)
        pipeline_ok = True
    except Exception as exc:
        # Best effort: keep going and score whatever was recorded.
        logger.exception(
            "eval: pipeline failed for %s/%s/%s: %s",
            fixture.name, model, prompt_variant, exc,
        )
    finally:
        settings.LLM_MODEL = original_model
        from app.ws_context import clear_current_user

        clear_current_user()

    elapsed = time.perf_counter() - started
    if pipeline_ok:
        logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
    else:
        logger.warning(
            "eval: pipeline failed after %.1fs — scoring %d recorded mutations",
            elapsed, len(mock.mutations),
        )

    # ── Score results ────────────────────────────────────────────
    scores = _score_mutations(fixture, mock, model=model, prompt_variant=prompt_variant)

    # ── Optional LLM judge ───────────────────────────────────────
    if use_llm_judge and fixture.expected:
        all_expected = [r.fields for r in fixture.expected]
        all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
        judge_score, reasoning = await llm_judge_score(
            all_expected, all_actual, judge_model=judge_model,
        )
        scores.llm_judge_score = judge_score
        scores.llm_judge_reasoning = reasoning

    # ── Report to Langfuse ───────────────────────────────────────
    dataset_name = f"batch-eval-{fixture.name}"
    dataset_item_id = f"{fixture.name}--{prompt_variant}"
    # Wall-clock timestamp on purpose: it makes the run name unique per invocation.
    run_name = f"{model}--{prompt_variant}--{int(time.time())}"

    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant=prompt_variant,
        prompt_template=prompt_template,
        actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
        scores_summary=scores.summary(),
        dataset_name=dataset_name,
        run_name=run_name,
        dataset_item_id=dataset_item_id,
    )

    # Scores are only posted when a trace id came back.
    if trace_id:
        langfuse_eval.post_eval_scores(scores, trace_id=trace_id)

    return scores
|
||||
|
||||
|
||||
async def run_fixture_eval(
    fixture: EvalFixture,
    models: list[str],
    *,
    variants: list[str] | None = None,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> list[EvalScores]:
    """Evaluate *fixture* for every requested (model × variant) pair.

    Args:
        fixture: The fixture to evaluate.
        models: Model names to run, in order.
        variants: Prompt-variant names to run; ``None`` means every variant
            defined on the fixture. Names missing from the fixture are
            skipped with a warning.
        use_llm_judge: Forwarded to ``run_single_eval``.
        judge_model: Forwarded to ``run_single_eval``.

    Returns:
        One EvalScores per executed combination, ordered model-major.
    """
    selected_variants = (
        list(fixture.prompt_variants.keys()) if variants is None else variants
    )

    # Sync the fixture to its Langfuse dataset before any run starts.
    langfuse_eval.sync_fixture_to_dataset(fixture)

    collected: list[EvalScores] = []
    for model_name in models:
        for variant_name in selected_variants:
            if variant_name in fixture.prompt_variants:
                outcome = await run_single_eval(
                    fixture,
                    model_name,
                    variant_name,
                    use_llm_judge=use_llm_judge,
                    judge_model=judge_model,
                )
                collected.append(outcome)
            else:
                logger.warning("eval: variant %r not found in fixture %s", variant_name, fixture.name)

    return collected
|
||||
|
||||
|
||||
def print_results(results: list[EvalScores]) -> None:
    """Write a fixed-width summary table of *results* to stdout.

    One row per result with precision (P), recall (R), F1, field accuracy
    (FA) and the LLM judge score (LLM, shown as ``--`` when absent). Any
    available judge reasoning is listed after the table. An empty *results*
    list prints a short notice instead.
    """
    if not results:
        print("\nNo eval results.")
        return

    rule_width = 90
    heavy_rule = "=" * rule_width
    header = (
        f"{'Fixture':<25} {'Model':<25} {'Variant':<15} "
        f"{'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}"
    )

    print("\n" + heavy_rule)
    print(header)
    print("-" * rule_width)

    for entry in results:
        if entry.llm_judge_score is None:
            judge_cell = " --"
        else:
            judge_cell = f"{entry.llm_judge_score:.2f}"

        identity = f"{entry.fixture_name:<25} {entry.model:<25} {entry.prompt_variant:<15} "
        metrics = f"{entry.precision:>6.2f} {entry.recall:>6.2f} {entry.f1:>6.2f} "
        tail = f"{entry.field_accuracy:>6.2f} {judge_cell:>6}"
        print(identity + metrics + tail)

    print(heavy_rule)

    # Judge reasoning, when present, goes below the table.
    for entry in results:
        if not entry.llm_judge_reasoning:
            continue
        print(f"\n[{entry.model} / {entry.prompt_variant}] LLM Judge: {entry.llm_judge_reasoning}")

    print()
|
||||
Reference in New Issue
Block a user