"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring. Supports three eval modes: - **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``). Calls the LLM with fixture-provided ``domain_definitions`` and ``projects_list`` and compares output against ``expected_classification``. - **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``). Compiles the prompt with fixture-provided ``existing_context``, ``project_context``, ``data_types``, and ``custom_prompt_section``, then runs the tool-calling loop. Mutations are scored against ``expected`` records. - **full**: Run ``run_local_agent()`` end-to-end (both steps). Scored on both classification and extraction. """ from __future__ import annotations import copy import json import logging import time import uuid from typing import Any from eval.config import EvalFixture, ExpectedClassification from eval.mock_executor import MockExecutor from eval.scorer import ( EvalScores, FieldScore, compute_precision_recall, llm_judge_score, score_field_match, ) from eval import langfuse_eval logger = logging.getLogger(__name__) # ── Step 1 runner ───────────────────────────────────────────────────────── async def _run_step1( fixture: EvalFixture, model: str, mock: MockExecutor, ) -> list[dict[str, Any]]: """Run step-1 classification for every file in the fixture directory. Scans the directory recursively, classifies each file, and returns a list of result dicts: ``[{file, project_id, domains, new_project_name}, ...]`` """ from app.agent_runner import _classify_file # Build project name lookup for display proj_names: dict[str, str] = { p.get("id", ""): p.get("name", "") for p in fixture.projects_list } # Discover all files in the fixture directory all_files = await _scan_fixture_files(mock, fixture.directory) print(f"\n Scanning {len(all_files)} files in {fixture.directory}\n") results: list[dict[str, Any]] = [] for i, file_path in enumerate(all_files, 1): file_result = await mock._handle( action="read_file_content", data={"path": file_path}, ) file_content: str = file_result.get("content", "") if not file_content.strip(): continue project_id, domains, new_name = await _classify_file( file_path=file_path, file_content=file_content, projects=fixture.projects_list, config_data_types=fixture.data_types, custom_system_prompt=fixture.custom_step1_prompt or None, ) short_name = file_path.rsplit("/", 1)[-1] if "/" in file_path else file_path proj_label = proj_names.get(project_id, new_name or "?") print(f" [{i}/{len(all_files)}] {short_name} → {project_id} ({proj_label}) {domains}") results.append({ "file": file_path, "project_id": project_id, "domains": domains, "new_project_name": new_name, }) return results async def _scan_fixture_files(mock: MockExecutor, directory: str) -> list[str]: """Recursively list all files under *directory* via the mock executor.""" files: list[str] = [] async def _walk(path: str) -> None: result = await mock._handle(action="list_directory", data={"path": path}) for entry in result.get("entries", []): if entry.get("type") == "directory": await _walk(entry["path"]) elif entry.get("type") == "file": files.append(entry["path"]) await _walk(directory) return sorted(files) def _score_step1( fixture: EvalFixture, results: list[dict[str, Any]], ) -> tuple[float, float, float, str]: """Score step-1 results. Returns (precision, recall, f1, reasoning). Files with expected classifications are scored (OK/FAIL). Files without expectations are shown as informational (INFO). """ if not fixture.expected_classification: return 0.0, 0.0, 0.0, "No expected classifications" # Build project name lookup proj_names: dict[str, str] = { p.get("id", ""): p.get("name", "") for p in fixture.projects_list } proj_names["new"] = "(new project)" def _proj_label(pid: str, new_name: str | None = None) -> str: name = proj_names.get(pid, "?") if pid == "new" and new_name: return f"new → \"{new_name}\"" return f"{pid} ({name})" if name and name != "?" else pid def _short_file(path: str) -> str: """Use just the filename for cleaner display.""" return path.rsplit("/", 1)[-1] if "/" in path else path expected_files = {ec.file for ec in fixture.expected_classification} total = len(fixture.expected_classification) matched = 0 scored_lines: list[str] = [] info_lines: list[str] = [] # Score expected files for ec in fixture.expected_classification: actual = next((r for r in results if r["file"] == ec.file), None) fname = _short_file(ec.file) if actual is None: scored_lines.append(f" MISS {fname}") scored_lines.append(f" expected: {_proj_label(ec.project_id)}") continue pid_ok = actual["project_id"] == ec.project_id domains_ok = set(actual["domains"]) == set(ec.domains) if ec.domains else True if pid_ok and domains_ok: matched += 1 scored_lines.append(f" OK {fname}") scored_lines.append(f" project: {_proj_label(actual['project_id'])}") scored_lines.append(f" domains: {actual['domains']}") else: scored_lines.append(f" FAIL {fname}") if not pid_ok: scored_lines.append(f" project: {_proj_label(actual['project_id'])} (expected: {_proj_label(ec.project_id)})") else: scored_lines.append(f" project: {_proj_label(actual['project_id'])}") if not domains_ok: scored_lines.append(f" domains: {actual['domains']} (expected: {ec.domains})") else: scored_lines.append(f" domains: {actual['domains']}") # Show unscored files for r in results: if r["file"] not in expected_files: fname = _short_file(r["file"]) proj = _proj_label(r["project_id"], r.get("new_project_name")) info_lines.append(f" · {fname}") info_lines.append(f" project: {proj} | domains: {r['domains']}") precision = matched / total if total > 0 else 0.0 recall = precision f1 = precision parts: list[str] = [] if scored_lines: parts.append(f"Scored ({matched}/{total}):") parts.extend(scored_lines) if info_lines: parts.append(f"\nOther files ({len(info_lines) // 2}):") parts.extend(info_lines) return precision, recall, f1, "\n".join(parts) # ── Step 2 runner ───────────────────────────────────────────────────────── async def _run_step2( fixture: EvalFixture, model: str, mock: MockExecutor, ) -> None: """Run step-2 processing for each file in the fixture directory. Compiles ``_PROCESSING_SYSTEM_PROMPT`` with fixture-provided variables and runs the tool-calling loop. Mutations are captured by the mock. """ from app.agent_runner import ( _PROCESSING_SYSTEM_PROMPT, _build_processing_tools, _run_agent_with_tools, _MAX_PROCESSING_STEPS, ) from app import tracing # Compile the processing prompt with fixture variables system_prompt = tracing.compile_prompt( "batch_processing", fallback=_PROCESSING_SYSTEM_PROMPT, variables={ "existing_context": fixture.existing_context, "project_context": fixture.project_context, "data_types": ", ".join(fixture.data_types), "custom_prompt_section": fixture.custom_prompt_section, }, ) tools = _build_processing_tools(fixture.data_types) # Scan files in the fixture directory file_entries = await mock._handle( action="list_directory", data={"path": fixture.directory}, ) for entry in file_entries.get("entries", []): if entry.get("type") != "file": continue # Filter by extension if specified if fixture.file_extensions: ext = entry["name"].rsplit(".", 1)[-1] if "." in entry["name"] else "" if ext not in fixture.file_extensions: continue file_result = await mock._handle( action="read_file_content", data={"path": entry["path"]}, ) file_content: str = file_result.get("content", "") if not file_content.strip(): continue await _run_agent_with_tools( system_prompt=system_prompt, user_message=( f"Process this file and extract relevant information.\n\n" f"File: {entry['path']}\n\nContent:\n{file_content}" ), tools=tools, max_steps=_MAX_PROCESSING_STEPS, ) # ── Full runner ─────────────────────────────────────────────────────────── async def _run_full( fixture: EvalFixture, model: str, mock: MockExecutor, user_id: str, ) -> None: """Run the full two-step pipeline via ``run_local_agent``.""" from app.agent_runner import run_local_agent trigger_data: dict[str, Any] = { "type": "agent_trigger", "directory": fixture.directory, "directory_paths": [fixture.directory], "data_types": fixture.data_types, "file_extensions": fixture.file_extensions, "prompt_template": fixture.custom_prompt_section, "device_id": "eval-harness", "run_context": { "agent_id": f"eval-{fixture.name}", "run_id": None, }, } with mock.patch(): await run_local_agent(user_id, trigger_data) # ── Scoring helpers ─────────────────────────────────────────────────────── def _score_mutations( fixture: EvalFixture, mock: MockExecutor, ) -> tuple[list[FieldScore], float, float, float, int, int]: """Score mutations against expected records. Returns (field_scores, precision, recall, f1, extra, missing). """ all_field_scores: list[FieldScore] = [] total_expected = 0 total_actual = 0 total_matched = 0 total_extra = 0 total_missing = 0 expected_by_table: dict[str, list[dict]] = {} for rec in fixture.expected: expected_by_table.setdefault(rec.table, []).append(rec.fields) tables = set(expected_by_table.keys()) | {m.table for m in mock.mutations} for table in tables: expected_records = expected_by_table.get(table, []) actual_records = mock.created_records(table) + mock.updated_records(table) field_scores, extra, missing = score_field_match(expected_records, actual_records, table) all_field_scores.extend(field_scores) matched = sum(1 for s in field_scores if s.best_match is not None) total_expected += len(expected_records) total_actual += len(actual_records) total_matched += matched total_extra += extra total_missing += missing precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched) return all_field_scores, precision, recall, f1, total_extra, total_missing # ── Main entry point ────────────────────────────────────────────────────── async def run_single_eval( fixture: EvalFixture, model: str, *, use_llm_judge: bool = True, judge_model: str = "gpt-4o-mini", ) -> EvalScores: """Execute one eval run for a fixture + model. Mode is read from the fixture.""" from shared.config import settings from shared.ws_context import set_current_user, clear_current_user seed = copy.deepcopy(fixture.seed_records) mock = MockExecutor( fixture_dir=fixture.fixture_path.parent, seed_records=seed, ) original_model = settings.LLM_MODEL settings.LLM_MODEL = model eval_user_id = str(uuid.uuid4()) logger.info( "eval: starting %s | mode=%s | model=%s", fixture.name, fixture.mode, model, ) start_time = time.time() step1_results: list[dict[str, Any]] = [] step1_reasoning = "" try: set_current_user(eval_user_id) if fixture.mode == "step1": with mock.patch(): step1_results = await _run_step1(fixture, model, mock) elif fixture.mode == "step2": with mock.patch(): await _run_step2(fixture, model, mock) elif fixture.mode == "full": with mock.patch(): # Step 1 — classification (independent from run_local_agent) if fixture.expected_classification: step1_results = await _run_step1(fixture, model, mock) # Step 2 — full pipeline (run_local_agent handles both steps) await _run_full(fixture, model, mock, eval_user_id) except Exception as exc: logger.error("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc) finally: settings.LLM_MODEL = original_model clear_current_user() elapsed = time.time() - start_time logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations)) # ── Score ───────────────────────────────────────────────────── if fixture.mode == "step1": s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results) scores = EvalScores( fixture_name=fixture.name, model=model, prompt_variant=fixture.mode, precision=s1_precision, recall=s1_recall, f1=s1_f1, llm_judge_reasoning=step1_reasoning, ) else: # step2 or full — score mutations field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock) scores = EvalScores( fixture_name=fixture.name, model=model, prompt_variant=fixture.mode, field_scores=field_scores, precision=precision, recall=recall, f1=f1, extra_records=extra, missing_records=missing, ) # Add step1 classification scores for full mode if fixture.mode == "full" and fixture.expected_classification: s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results) scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}" # Optional LLM judge for extraction quality if use_llm_judge and fixture.expected: all_expected = [r.fields for r in fixture.expected] all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")] judge_score, reasoning = await llm_judge_score( all_expected, all_actual, judge_model=judge_model, ) scores.llm_judge_score = judge_score if step1_reasoning: scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}" else: scores.llm_judge_reasoning = reasoning # ── Report to Langfuse ──────────────────────────────────────── prompt_names = { "step1": ["batch_file_classifier"], "step2": ["batch_processing"], "full": ["batch_file_classifier", "batch_processing"], }.get(fixture.mode, ["batch_processing"]) trace_id = langfuse_eval.log_eval_trace( fixture_name=fixture.name, model=model, prompt_variant=fixture.mode, prompt_template=fixture.custom_prompt_section or "(default)", actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations], scores_summary=scores.summary(), step1_results=step1_results or None, langfuse_prompt_names=prompt_names, ) if trace_id: langfuse_eval.post_eval_scores(scores, trace_id=trace_id) # For full mode, post classification scores separately if fixture.mode == "full" and fixture.expected_classification: s1_p, s1_r, s1_f1, _ = _score_step1(fixture, step1_results) for name, value in [ ("classification_precision", s1_p), ("classification_recall", s1_r), ("classification_f1", s1_f1), ]: try: from langfuse import get_client lf = get_client() if lf: lf.create_score( name=name, value=value, trace_id=trace_id, data_type="NUMERIC", comment=f"{fixture.name} | {model} | full", ) except Exception: pass return scores async def run_fixture_eval( fixture: EvalFixture, models: list[str], *, use_llm_judge: bool = True, judge_model: str = "gpt-4o-mini", ) -> list[EvalScores]: """Run all models for a fixture.""" langfuse_eval.sync_fixture_to_dataset(fixture) results: list[EvalScores] = [] for model in models: scores = await run_single_eval( fixture, model, use_llm_judge=use_llm_judge, judge_model=judge_model, ) results.append(scores) return results def print_results(results: list[EvalScores]) -> None: """Print a formatted summary table of eval results.""" if not results: print("\nNo eval results.") return W = 90 print("\n" + "=" * W) print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}") print("-" * W) for s in results: llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else " --" fa_str = f"{s.field_accuracy:.2f}" if s.field_scores else " --" print( f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} " f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} " f"{fa_str:>6} {llm_str:>6}" ) print("=" * W) for s in results: if s.llm_judge_reasoning: print(f"\n{'─' * W}") print(f" {s.fixture_name} | {s.model} | {s.prompt_variant}") print(f"{'─' * W}") print(s.llm_judge_reasoning) print()