546 lines
19 KiB
Python
546 lines
19 KiB
Python
"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.
|
|
|
|
Supports three eval modes:
|
|
|
|
- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
|
|
Calls the LLM with fixture-provided ``domain_definitions`` and
|
|
``projects_list`` and compares output against ``expected_classification``.
|
|
|
|
- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
|
|
Compiles the prompt with fixture-provided ``existing_context``,
|
|
``project_context``, ``data_types``, and ``custom_prompt_section``,
|
|
then runs the tool-calling loop. Mutations are scored against
|
|
``expected`` records.
|
|
|
|
- **full**: Run ``run_local_agent()`` end-to-end (both steps).
|
|
Scored on both classification and extraction.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import copy
|
|
import json
|
|
import logging
|
|
import time
|
|
import uuid
|
|
from typing import Any
|
|
|
|
from eval.config import EvalFixture, ExpectedClassification
|
|
from eval.mock_executor import MockExecutor
|
|
from eval.scorer import (
|
|
EvalScores,
|
|
FieldScore,
|
|
compute_precision_recall,
|
|
llm_judge_score,
|
|
score_field_match,
|
|
)
|
|
from eval import langfuse_eval
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ── Step 1 runner ─────────────────────────────────────────────────────────
|
|
|
|
|
|
async def _run_step1(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
) -> list[dict[str, Any]]:
    """Classify every non-empty file found under the fixture directory.

    Files are discovered recursively through the mock executor, read one by
    one, and handed to ``_classify_file``. Files whose content is empty or
    whitespace-only are skipped. A progress line is printed per classified
    file.

    *model* is not referenced in this function.

    Returns one dict per classified file with the keys ``file``,
    ``project_id``, ``domains`` and ``new_project_name``.
    """
    from app.agent_runner import _classify_file

    # id → name table, used only to make the progress output readable.
    display_names: dict[str, str] = {}
    for project in fixture.projects_list:
        display_names[project.get("id", "")] = project.get("name", "")

    paths = await _scan_fixture_files(mock, fixture.directory)
    file_count = len(paths)
    print(f"\n Scanning {file_count} files in {fixture.directory}\n")

    classified: list[dict[str, Any]] = []
    for position, path in enumerate(paths, start=1):
        payload = await mock._handle(
            action="read_file_content",
            data={"path": path},
        )
        text: str = payload.get("content", "")
        if not text.strip():
            continue

        project_id, domains, new_name = await _classify_file(
            file_path=path,
            file_content=text,
            projects=fixture.projects_list,
            config_data_types=fixture.data_types,
            custom_system_prompt=fixture.custom_step1_prompt or None,
        )

        # rsplit returns the whole string when there is no "/" in the path.
        basename = path.rsplit("/", 1)[-1]
        if project_id in display_names:
            label = display_names[project_id]
        else:
            label = new_name or "?"
        print(f" [{position}/{file_count}] {basename} → {project_id} ({label}) {domains}")

        record: dict[str, Any] = {
            "file": path,
            "project_id": project_id,
            "domains": domains,
            "new_project_name": new_name,
        }
        classified.append(record)

    return classified
|
|
|
|
|
|
async def _scan_fixture_files(mock: MockExecutor, directory: str) -> list[str]:
    """Return the sorted paths of all files beneath *directory*.

    The tree is walked depth-first by issuing ``list_directory`` requests to
    the mock executor. Entries typed ``"file"`` are collected, entries typed
    ``"directory"`` are descended into, and any other entry type is ignored.
    """
    found: list[str] = []

    async def _descend(current: str) -> None:
        listing = await mock._handle(action="list_directory", data={"path": current})
        for item in listing.get("entries", []):
            kind = item.get("type")
            if kind == "file":
                found.append(item["path"])
            elif kind == "directory":
                # Descend immediately so sub-directories are listed in
                # encounter order.
                await _descend(item["path"])

    await _descend(directory)
    found.sort()
    return found
|
|
|
|
|
|
def _score_step1(
    fixture: EvalFixture,
    results: list[dict[str, Any]],
) -> tuple[float, float, float, str]:
    """Score step-1 results. Returns (precision, recall, f1, reasoning).

    Files with expected classifications are scored (OK/FAIL/MISS).
    Files without expectations are shown as informational lines only.

    A file counts as matched when its ``project_id`` equals the expected one
    and, if the expectation lists domains, the set of returned domains equals
    the expected set. All three returned metrics are the same value:
    ``matched / number of expectations``.

    Args:
        fixture: Provides ``expected_classification`` (items with ``file``,
            ``project_id`` and ``domains``) and ``projects_list`` (dicts with
            ``id`` and ``name``).
        results: Dicts with ``file``, ``project_id``, ``domains`` and
            optionally ``new_project_name``. If a file appears more than once,
            the first entry is the one that is scored.

    Returns:
        ``(precision, recall, f1, reasoning)`` where *reasoning* is a
        multi-line, human-readable report. When the fixture has no
        expectations the result is ``(0.0, 0.0, 0.0, "No expected
        classifications")``.
    """
    if not fixture.expected_classification:
        return 0.0, 0.0, 0.0, "No expected classifications"

    # Build project name lookup
    proj_names: dict[str, str] = {
        p.get("id", ""): p.get("name", "") for p in fixture.projects_list
    }

    def _proj_label(pid: str, new_name: str | None = None) -> str:
        # "new" is the classifier's marker for a not-yet-existing project; it
        # takes precedence over any listed project with that id.
        if pid == "new":
            return f"new → \"{new_name}\"" if new_name else "new (new project)"
        name = proj_names.get(pid, "?")
        return f"{pid} ({name})" if name and name != "?" else pid

    def _short_file(path: str) -> str:
        """Use just the filename for cleaner display."""
        return path.rsplit("/", 1)[-1]

    # Index results once; setdefault keeps the first entry per file.
    by_file: dict[str, dict[str, Any]] = {}
    for r in results:
        by_file.setdefault(r["file"], r)

    expected_files = {ec.file for ec in fixture.expected_classification}
    total = len(fixture.expected_classification)
    matched = 0

    scored_lines: list[str] = []
    info_lines: list[str] = []

    # Score expected files
    for ec in fixture.expected_classification:
        actual = by_file.get(ec.file)
        fname = _short_file(ec.file)
        if actual is None:
            scored_lines.append(f" MISS {fname}")
            scored_lines.append(f" expected: {_proj_label(ec.project_id)}")
            continue

        actual_label = _proj_label(actual["project_id"], actual.get("new_project_name"))
        pid_ok = actual["project_id"] == ec.project_id
        # Expectations without domains only check the project. Missing/None
        # domains on the result are treated as an empty set instead of crashing.
        domains_ok = (
            set(actual["domains"] or ()) == set(ec.domains) if ec.domains else True
        )

        if pid_ok and domains_ok:
            matched += 1
            scored_lines.append(f" OK {fname}")
            scored_lines.append(f" project: {actual_label}")
            scored_lines.append(f" domains: {actual['domains']}")
            continue

        scored_lines.append(f" FAIL {fname}")
        if pid_ok:
            scored_lines.append(f" project: {actual_label}")
        else:
            scored_lines.append(
                f" project: {actual_label} (expected: {_proj_label(ec.project_id)})"
            )
        if domains_ok:
            scored_lines.append(f" domains: {actual['domains']}")
        else:
            scored_lines.append(
                f" domains: {actual['domains']} (expected: {ec.domains})"
            )

    # Show unscored files
    other_count = 0
    for r in results:
        if r["file"] in expected_files:
            continue
        other_count += 1
        proj = _proj_label(r["project_id"], r.get("new_project_name"))
        info_lines.append(f" · {_short_file(r['file'])}")
        info_lines.append(f" project: {proj} | domains: {r['domains']}")

    # One score per expectation, so precision, recall and F1 coincide.
    precision = matched / total if total > 0 else 0.0
    recall = precision
    f1 = precision

    parts: list[str] = []
    if scored_lines:
        parts.append(f"Scored ({matched}/{total}):")
        parts.extend(scored_lines)
    if info_lines:
        parts.append(f"\nOther files ({other_count}):")
        parts.extend(info_lines)

    return precision, recall, f1, "\n".join(parts)
|
|
|
|
|
|
# ── Step 2 runner ─────────────────────────────────────────────────────────
|
|
|
|
|
|
async def _run_step2(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
) -> None:
    """Run the step-2 processing prompt over the fixture's files.

    The ``batch_processing`` prompt is compiled once from the fixture's
    variables, with ``_PROCESSING_SYSTEM_PROMPT`` as fallback, and the
    tool-calling loop is then run once per file.

    Only the entries returned by a single ``list_directory`` call on
    ``fixture.directory`` are considered. Non-file entries, files whose
    extension is not in ``fixture.file_extensions`` (when that is non-empty),
    and files with empty or whitespace-only content are skipped.

    *model* is not referenced in this function.
    """
    from app.agent_runner import (
        _PROCESSING_SYSTEM_PROMPT,
        _build_processing_tools,
        _run_agent_with_tools,
        _MAX_PROCESSING_STEPS,
    )
    from app import tracing

    # Fill the prompt template from the fixture.
    prompt_variables = {
        "existing_context": fixture.existing_context,
        "project_context": fixture.project_context,
        "data_types": ", ".join(fixture.data_types),
        "custom_prompt_section": fixture.custom_prompt_section,
    }
    compiled_prompt = tracing.compile_prompt(
        "batch_processing",
        fallback=_PROCESSING_SYSTEM_PROMPT,
        variables=prompt_variables,
    )

    processing_tools = _build_processing_tools(fixture.data_types)

    listing = await mock._handle(
        action="list_directory",
        data={"path": fixture.directory},
    )
    for item in listing.get("entries", []):
        if item.get("type") != "file":
            continue

        # Optional extension filter: the text after the last dot, or "" when
        # the name has no dot at all.
        if fixture.file_extensions:
            _, dot, suffix = item["name"].rpartition(".")
            extension = suffix if dot else ""
            if extension not in fixture.file_extensions:
                continue

        payload = await mock._handle(
            action="read_file_content",
            data={"path": item["path"]},
        )
        text: str = payload.get("content", "")
        if not text.strip():
            continue

        request = (
            f"Process this file and extract relevant information.\n\n"
            f"File: {item['path']}\n\nContent:\n{text}"
        )
        await _run_agent_with_tools(
            system_prompt=compiled_prompt,
            user_message=request,
            tools=processing_tools,
            max_steps=_MAX_PROCESSING_STEPS,
        )
|
|
|
|
|
|
# ── Full runner ───────────────────────────────────────────────────────────
|
|
|
|
|
|
async def _run_full(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
    user_id: str,
) -> None:
    """Run the complete pipeline through ``run_local_agent``.

    A synthetic ``agent_trigger`` payload is assembled from the fixture and
    passed to ``run_local_agent`` together with *user_id*, while
    ``mock.patch()`` is active. *model* is not referenced in this function.
    """
    from app.agent_runner import run_local_agent

    run_context: dict[str, Any] = {
        "agent_id": f"eval-{fixture.name}",
        "run_id": None,
    }
    trigger_data: dict[str, Any] = {
        "type": "agent_trigger",
        "directory": fixture.directory,
        "directory_paths": [fixture.directory],
        "data_types": fixture.data_types,
        "file_extensions": fixture.file_extensions,
        "prompt_template": fixture.custom_prompt_section,
        "device_id": "eval-harness",
        "run_context": run_context,
    }

    with mock.patch():
        await run_local_agent(user_id, trigger_data)
|
|
|
|
|
|
# ── Scoring helpers ───────────────────────────────────────────────────────
|
|
|
|
|
|
def _score_mutations(
    fixture: EvalFixture,
    mock: MockExecutor,
) -> tuple[list[FieldScore], float, float, float, int, int]:
    """Score mutations against expected records.

    Expected records are grouped by table. For every table that has either
    expectations or captured mutations, the created and updated records are
    compared against the expectations with ``score_field_match``. Counts are
    summed over all tables and turned into precision/recall/F1 by
    ``compute_precision_recall``.

    Tables are visited in first-seen order (expected tables first, then tables
    that appear only in mutations) so that the order of the returned field
    scores is the same on every run.

    Returns (field_scores, precision, recall, f1, extra, missing).
    """
    all_field_scores: list[FieldScore] = []
    total_expected = 0
    total_actual = 0
    total_matched = 0
    total_extra = 0
    total_missing = 0

    expected_by_table: dict[str, list[dict]] = {}
    for rec in fixture.expected:
        expected_by_table.setdefault(rec.table, []).append(rec.fields)

    # A dict is used as an ordered set: iterating a plain ``set`` of strings
    # can yield a different order in each process.
    tables = dict.fromkeys([*expected_by_table, *(m.table for m in mock.mutations)])
    for table in tables:
        expected_records = expected_by_table.get(table, [])
        actual_records = mock.created_records(table) + mock.updated_records(table)

        field_scores, extra, missing = score_field_match(expected_records, actual_records, table)
        all_field_scores.extend(field_scores)

        # A field score counts as matched when it was paired with a record.
        matched = sum(1 for s in field_scores if s.best_match is not None)
        total_expected += len(expected_records)
        total_actual += len(actual_records)
        total_matched += matched
        total_extra += extra
        total_missing += missing

    precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
    return all_field_scores, precision, recall, f1, total_extra, total_missing
|
|
|
|
|
|
# ── Main entry point ──────────────────────────────────────────────────────
|
|
|
|
|
|
async def run_single_eval(
    fixture: EvalFixture,
    model: str,
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> EvalScores:
    """Execute one eval run for a fixture + model. Mode is read from the fixture.

    The pipeline runs against a fresh ``MockExecutor`` seeded with a deep copy
    of ``fixture.seed_records``, with ``settings.LLM_MODEL`` temporarily set to
    *model* and a random user id installed as the current user. Both are
    restored/cleared afterwards, even on failure.

    A pipeline failure is logged with its traceback and does not abort the
    run: scoring proceeds on whatever results and mutations were captured.

    Scoring:
      * ``step1`` — classification results are scored with ``_score_step1``.
      * any other mode — captured mutations are scored with
        ``_score_mutations``; in ``full`` mode with expected classifications
        the step-1 report is added to the reasoning text, and the optional
        LLM judge compares expected records with insert/update mutations.

    The run is then logged to Langfuse. When a trace id is returned, the
    scores are posted to it, and in ``full`` mode the classification metrics
    are posted as separate numeric scores. Posting failures are logged and
    otherwise ignored.

    Args:
        fixture: The eval fixture (mode, inputs and expectations).
        model: Model name to run the pipeline with.
        use_llm_judge: Whether to run the LLM judge when the fixture has
            expected records.
        judge_model: Model name passed to the LLM judge.

    Returns:
        The ``EvalScores`` for this fixture/model pair.
    """
    from shared.config import settings
    from shared.ws_context import set_current_user, clear_current_user

    seed = copy.deepcopy(fixture.seed_records)
    mock = MockExecutor(
        fixture_dir=fixture.fixture_path.parent,
        seed_records=seed,
    )
    eval_user_id = str(uuid.uuid4())

    logger.info(
        "eval: starting %s | mode=%s | model=%s",
        fixture.name, fixture.mode, model,
    )
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    step1_results: list[dict[str, Any]] = []
    step1_reasoning = ""

    # Override the model immediately before the try so that the finally
    # clause is guaranteed to undo it.
    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    try:
        set_current_user(eval_user_id)

        if fixture.mode == "step1":
            with mock.patch():
                step1_results = await _run_step1(fixture, model, mock)

        elif fixture.mode == "step2":
            with mock.patch():
                await _run_step2(fixture, model, mock)

        elif fixture.mode == "full":
            with mock.patch():
                # Step 1 — classification (independent from run_local_agent)
                if fixture.expected_classification:
                    step1_results = await _run_step1(fixture, model, mock)

                # Step 2 — full pipeline (run_local_agent handles both steps)
                await _run_full(fixture, model, mock, eval_user_id)

        else:
            logger.warning(
                "eval: unknown mode %r for %s — nothing was run",
                fixture.mode, fixture.name,
            )

    except Exception as exc:
        # Best-effort: keep going so partial results are still scored, but
        # record the traceback so the failure can be diagnosed.
        logger.exception("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
    finally:
        settings.LLM_MODEL = original_model
        clear_current_user()

    elapsed = time.perf_counter() - start_time
    logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))

    # ── Score ─────────────────────────────────────────────────────

    # (precision, recall, f1) of step-1 classification in full mode; computed
    # once and reused when posting to Langfuse.
    classification_scores: tuple[float, float, float] | None = None

    if fixture.mode == "step1":
        s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
        scores = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant=fixture.mode,
            precision=s1_precision,
            recall=s1_recall,
            f1=s1_f1,
            llm_judge_reasoning=step1_reasoning,
        )
    else:
        # step2 or full — score mutations
        field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
        scores = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant=fixture.mode,
            field_scores=field_scores,
            precision=precision,
            recall=recall,
            f1=f1,
            extra_records=extra,
            missing_records=missing,
        )

        # Add step1 classification scores for full mode
        if fixture.mode == "full" and fixture.expected_classification:
            s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
            classification_scores = (s1_p, s1_r, s1_f1)
            scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"

        # Optional LLM judge for extraction quality
        if use_llm_judge and fixture.expected:
            all_expected = [r.fields for r in fixture.expected]
            all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
            judge_score, reasoning = await llm_judge_score(
                all_expected, all_actual, judge_model=judge_model,
            )
            scores.llm_judge_score = judge_score
            if step1_reasoning:
                # Keep the classification report and append the judge's text.
                scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
            else:
                scores.llm_judge_reasoning = reasoning

    # ── Report to Langfuse ────────────────────────────────────────
    prompt_names = {
        "step1": ["batch_file_classifier"],
        "step2": ["batch_processing"],
        "full": ["batch_file_classifier", "batch_processing"],
    }.get(fixture.mode, ["batch_processing"])

    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant=fixture.mode,
        prompt_template=fixture.custom_prompt_section or "(default)",
        actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
        scores_summary=scores.summary(),
        step1_results=step1_results or None,
        langfuse_prompt_names=prompt_names,
    )

    if trace_id:
        langfuse_eval.post_eval_scores(scores, trace_id=trace_id)

        # For full mode, post classification scores separately
        if classification_scores is not None:
            try:
                from langfuse import get_client

                lf = get_client()
            except Exception:
                logger.warning(
                    "eval: Langfuse client unavailable; classification scores not posted",
                    exc_info=True,
                )
                lf = None

            if lf:
                score_names = (
                    "classification_precision",
                    "classification_recall",
                    "classification_f1",
                )
                for name, value in zip(score_names, classification_scores):
                    try:
                        lf.create_score(
                            name=name,
                            value=value,
                            trace_id=trace_id,
                            data_type="NUMERIC",
                            comment=f"{fixture.name} | {model} | full",
                        )
                    except Exception:
                        # Reporting is best-effort; one failed score must not
                        # prevent the others or fail the eval.
                        logger.warning(
                            "eval: failed to post %s to Langfuse", name, exc_info=True,
                        )

    return scores
|
|
|
|
|
|
async def run_fixture_eval(
    fixture: EvalFixture,
    models: list[str],
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> list[EvalScores]:
    """Evaluate *fixture* once per model and collect the scores.

    The fixture is first synced to the Langfuse dataset. The models are then
    evaluated one after another, in the order given, and the returned list
    has one ``EvalScores`` per model in that same order.
    """
    langfuse_eval.sync_fixture_to_dataset(fixture)

    judge_options = {"use_llm_judge": use_llm_judge, "judge_model": judge_model}
    # Each await completes before the next model starts, so runs are sequential.
    return [
        await run_single_eval(fixture, candidate, **judge_options)
        for candidate in models
    ]
|
|
|
|
|
|
def print_results(results: list[EvalScores]) -> None:
    """Print eval results as a table, followed by any reasoning text.

    One row is printed per result with precision, recall, F1, field accuracy
    and LLM judge score; a placeholder is shown when a result has no field
    scores or no judge score. After the table, each result that has reasoning
    text gets its own titled section. Prints a short notice and returns when
    *results* is empty.
    """
    if not results:
        print("\nNo eval results.")
        return

    W = 90
    heavy_rule = "=" * W
    light_rule = "─" * W

    print(f"\n{heavy_rule}")
    print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
    print("-" * W)

    for row in results:
        if row.llm_judge_score is not None:
            judge_cell = f"{row.llm_judge_score:.2f}"
        else:
            judge_cell = " --"
        if row.field_scores:
            accuracy_cell = f"{row.field_accuracy:.2f}"
        else:
            accuracy_cell = " --"

        identity = f"{row.fixture_name:<25} {row.prompt_variant:<6} {row.model:<25} "
        metrics = f"{row.precision:>6.2f} {row.recall:>6.2f} {row.f1:>6.2f} "
        tail = f"{accuracy_cell:>6} {judge_cell:>6}"
        print(identity + metrics + tail)

    print(heavy_rule)

    # Detailed reasoning, only for the results that carry any.
    for row in results:
        if not row.llm_judge_reasoning:
            continue
        print("\n" + light_rule)
        print(f" {row.fixture_name} | {row.model} | {row.prompt_variant}")
        print(light_rule)
        print(row.llm_judge_reasoning)

    print()
|