"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring. Supports three eval modes: - **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``). Calls the LLM with fixture-provided ``domain_definitions`` and ``projects_list`` and compares output against ``expected_classification``. - **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``). Compiles the prompt with fixture-provided ``existing_context``, ``project_context``, ``data_types``, and ``custom_prompt_section``, then runs the tool-calling loop. Mutations are scored against ``expected`` records. - **full**: Run ``run_local_agent()`` end-to-end (both steps). Scored on both classification and extraction. """ from __future__ import annotations import copy import json import logging import time import uuid from typing import Any from eval.config import EvalFixture, ExpectedClassification from eval.mock_executor import MockExecutor from eval.scorer import ( EvalScores, FieldScore, compute_precision_recall, llm_judge_score, score_field_match, ) from eval import langfuse_eval logger = logging.getLogger(__name__) # ── Step 1 runner ───────────────────────────────────────────────────────── async def _run_step1( fixture: EvalFixture, model: str, mock: MockExecutor, ) -> list[dict[str, Any]]: """Run step-1 classification for each expected file. Returns a list of result dicts: ``[{file, project_id, domains, new_project_name}, ...]`` """ from app.agent_runner import _classify_file results: list[dict[str, Any]] = [] for ec in fixture.expected_classification: # Read the file content through the mock file_result = await mock._handle( action="read_file_content", data={"path": ec.file}, ) file_content: str = file_result.get("content", "") project_id, domains, new_name = await _classify_file( file_path=ec.file, file_content=file_content, projects=fixture.projects_list, config_data_types=fixture.data_types, ) results.append({ "file": ec.file, "project_id": project_id, "domains": domains, "new_project_name": new_name, }) return results def _score_step1( fixture: EvalFixture, results: list[dict[str, Any]], ) -> tuple[float, float, float, str]: """Score step-1 results. Returns (precision, recall, f1, reasoning).""" if not fixture.expected_classification: return 0.0, 0.0, 0.0, "No expected classifications" total = len(fixture.expected_classification) matched = 0 details: list[str] = [] for ec in fixture.expected_classification: actual = next((r for r in results if r["file"] == ec.file), None) if actual is None: details.append(f" MISS {ec.file}: not processed") continue pid_ok = actual["project_id"] == ec.project_id domains_ok = set(actual["domains"]) == set(ec.domains) if ec.domains else True if pid_ok and domains_ok: matched += 1 details.append(f" OK {ec.file}: project={actual['project_id']}, domains={actual['domains']}") else: parts: list[str] = [] if not pid_ok: parts.append(f"project expected={ec.project_id} got={actual['project_id']}") if not domains_ok: parts.append(f"domains expected={ec.domains} got={actual['domains']}") details.append(f" FAIL {ec.file}: {'; '.join(parts)}") precision = matched / total if total > 0 else 0.0 recall = precision # in step1, precision == recall (same denominator) f1 = precision # same reasoning = "\n".join(details) return precision, recall, f1, reasoning # ── Step 2 runner ───────────────────────────────────────────────────────── async def _run_step2( fixture: EvalFixture, model: str, mock: MockExecutor, ) -> None: """Run step-2 processing for each file in the fixture directory. Compiles ``_PROCESSING_SYSTEM_PROMPT`` with fixture-provided variables and runs the tool-calling loop. Mutations are captured by the mock. """ from app.agent_runner import ( _PROCESSING_SYSTEM_PROMPT, _build_processing_tools, _run_agent_with_tools, _MAX_PROCESSING_STEPS, ) from app import tracing # Compile the processing prompt with fixture variables system_prompt = tracing.compile_prompt( "batch_processing", fallback=_PROCESSING_SYSTEM_PROMPT, variables={ "existing_context": fixture.existing_context, "project_context": fixture.project_context, "data_types": ", ".join(fixture.data_types), "custom_prompt_section": fixture.custom_prompt_section, }, ) tools = _build_processing_tools(fixture.data_types) # Scan files in the fixture directory file_entries = await mock._handle( action="list_directory", data={"path": fixture.directory}, ) for entry in file_entries.get("entries", []): if entry.get("type") != "file": continue # Filter by extension if specified if fixture.file_extensions: ext = entry["name"].rsplit(".", 1)[-1] if "." in entry["name"] else "" if ext not in fixture.file_extensions: continue file_result = await mock._handle( action="read_file_content", data={"path": entry["path"]}, ) file_content: str = file_result.get("content", "") if not file_content.strip(): continue await _run_agent_with_tools( system_prompt=system_prompt, user_message=( f"Process this file and extract relevant information.\n\n" f"File: {entry['path']}\n\nContent:\n{file_content}" ), tools=tools, max_steps=_MAX_PROCESSING_STEPS, ) # ── Full runner ─────────────────────────────────────────────────────────── async def _run_full( fixture: EvalFixture, model: str, mock: MockExecutor, user_id: str, ) -> None: """Run the full two-step pipeline via ``run_local_agent``.""" from app.agent_runner import run_local_agent trigger_data: dict[str, Any] = { "type": "agent_trigger", "directory": fixture.directory, "directory_paths": [fixture.directory], "data_types": fixture.data_types, "file_extensions": fixture.file_extensions, "prompt_template": fixture.custom_prompt_section, "device_id": "eval-harness", "run_context": { "agent_id": f"eval-{fixture.name}", "run_id": None, }, } with mock.patch(): await run_local_agent(user_id, trigger_data) # ── Scoring helpers ─────────────────────────────────────────────────────── def _score_mutations( fixture: EvalFixture, mock: MockExecutor, ) -> tuple[list[FieldScore], float, float, float, int, int]: """Score mutations against expected records. Returns (field_scores, precision, recall, f1, extra, missing). """ all_field_scores: list[FieldScore] = [] total_expected = 0 total_actual = 0 total_matched = 0 total_extra = 0 total_missing = 0 expected_by_table: dict[str, list[dict]] = {} for rec in fixture.expected: expected_by_table.setdefault(rec.table, []).append(rec.fields) tables = set(expected_by_table.keys()) | {m.table for m in mock.mutations} for table in tables: expected_records = expected_by_table.get(table, []) actual_records = mock.created_records(table) + mock.updated_records(table) field_scores, extra, missing = score_field_match(expected_records, actual_records, table) all_field_scores.extend(field_scores) matched = sum(1 for s in field_scores if s.best_match is not None) total_expected += len(expected_records) total_actual += len(actual_records) total_matched += matched total_extra += extra total_missing += missing precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched) return all_field_scores, precision, recall, f1, total_extra, total_missing # ── Main entry point ────────────────────────────────────────────────────── async def run_single_eval( fixture: EvalFixture, model: str, *, use_llm_judge: bool = True, judge_model: str = "gpt-4o-mini", ) -> EvalScores: """Execute one eval run for a fixture + model. Mode is read from the fixture.""" from shared.config import settings from shared.ws_context import set_current_user, clear_current_user seed = copy.deepcopy(fixture.seed_records) mock = MockExecutor( fixture_dir=fixture.fixture_path.parent, seed_records=seed, ) original_model = settings.LLM_MODEL settings.LLM_MODEL = model eval_user_id = str(uuid.uuid4()) logger.info( "eval: starting %s | mode=%s | model=%s", fixture.name, fixture.mode, model, ) start_time = time.time() step1_results: list[dict[str, Any]] = [] step1_reasoning = "" try: set_current_user(eval_user_id) if fixture.mode == "step1": with mock.patch(): step1_results = await _run_step1(fixture, model, mock) elif fixture.mode == "step2": with mock.patch(): await _run_step2(fixture, model, mock) elif fixture.mode == "full": with mock.patch(): # Step 1 — classification (independent from run_local_agent) if fixture.expected_classification: step1_results = await _run_step1(fixture, model, mock) # Step 2 — full pipeline (run_local_agent handles both steps) await _run_full(fixture, model, mock, eval_user_id) except Exception as exc: logger.error("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc) finally: settings.LLM_MODEL = original_model clear_current_user() elapsed = time.time() - start_time logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations)) # ── Score ───────────────────────────────────────────────────── if fixture.mode == "step1": s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results) scores = EvalScores( fixture_name=fixture.name, model=model, prompt_variant=fixture.mode, precision=s1_precision, recall=s1_recall, f1=s1_f1, llm_judge_reasoning=step1_reasoning, ) else: # step2 or full — score mutations field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock) scores = EvalScores( fixture_name=fixture.name, model=model, prompt_variant=fixture.mode, field_scores=field_scores, precision=precision, recall=recall, f1=f1, extra_records=extra, missing_records=missing, ) # Add step1 classification scores for full mode if fixture.mode == "full" and fixture.expected_classification: s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results) scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}" # Optional LLM judge for extraction quality if use_llm_judge and fixture.expected: all_expected = [r.fields for r in fixture.expected] all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")] judge_score, reasoning = await llm_judge_score( all_expected, all_actual, judge_model=judge_model, ) scores.llm_judge_score = judge_score if step1_reasoning: scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}" else: scores.llm_judge_reasoning = reasoning # ── Report to Langfuse ──────────────────────────────────────── prompt_names = { "step1": ["batch_file_classifier"], "step2": ["batch_processing"], "full": ["batch_file_classifier", "batch_processing"], }.get(fixture.mode, ["batch_processing"]) trace_id = langfuse_eval.log_eval_trace( fixture_name=fixture.name, model=model, prompt_variant=fixture.mode, prompt_template=fixture.custom_prompt_section or "(default)", actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations], scores_summary=scores.summary(), step1_results=step1_results or None, langfuse_prompt_names=prompt_names, ) if trace_id: langfuse_eval.post_eval_scores(scores, trace_id=trace_id) # For full mode, post classification scores separately if fixture.mode == "full" and fixture.expected_classification: s1_p, s1_r, s1_f1, _ = _score_step1(fixture, step1_results) for name, value in [ ("classification_precision", s1_p), ("classification_recall", s1_r), ("classification_f1", s1_f1), ]: try: from langfuse import get_client lf = get_client() if lf: lf.create_score( name=name, value=value, trace_id=trace_id, data_type="NUMERIC", comment=f"{fixture.name} | {model} | full", ) except Exception: pass return scores async def run_fixture_eval( fixture: EvalFixture, models: list[str], *, use_llm_judge: bool = True, judge_model: str = "gpt-4o-mini", ) -> list[EvalScores]: """Run all models for a fixture.""" langfuse_eval.sync_fixture_to_dataset(fixture) results: list[EvalScores] = [] for model in models: scores = await run_single_eval( fixture, model, use_llm_judge=use_llm_judge, judge_model=judge_model, ) results.append(scores) return results def print_results(results: list[EvalScores]) -> None: """Print a formatted summary table of eval results.""" if not results: print("\nNo eval results.") return print("\n" + "=" * 95) print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}") print("-" * 95) for s in results: llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else " --" print( f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} " f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} " f"{s.field_accuracy:>6.2f} {llm_str:>6}" ) print("=" * 95) print() print("=" * 90) # If LLM judge reasoning is available, print it for s in results: if s.llm_judge_reasoning: print(f"\n[{s.model} / {s.prompt_variant}] LLM Judge: {s.llm_judge_reasoning}") print()