# api/services/batch-agent/eval/runner.py

"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.

Supports three eval modes:

- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
  Calls the LLM with fixture-provided ``domain_definitions`` and
  ``projects_list`` and compares output against ``expected_classification``.

- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
  Compiles the prompt with fixture-provided ``existing_context``,
  ``project_context``, ``data_types``, and ``custom_prompt_section``,
  then runs the tool-calling loop.  Mutations are scored against
  ``expected`` records.

- **full**: Run ``run_local_agent()`` end-to-end (both steps).
  Scored on both classification and extraction.
"""

from __future__ import annotations

import copy
import json
import logging
import time
import uuid
from typing import Any

from eval.config import EvalFixture, ExpectedClassification
from eval.mock_executor import MockExecutor
from eval.scorer import (
    EvalScores,
    FieldScore,
    compute_precision_recall,
    llm_judge_score,
    score_field_match,
)
from eval import langfuse_eval

logger = logging.getLogger(__name__)


# ── Step 1 runner ─────────────────────────────────────────────────────────


async def _run_step1(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
) -> list[dict[str, Any]]:
    """Classify every non-empty file found under the fixture directory.

    The directory is walked recursively through the mock executor; each
    file's content is read and handed to ``_classify_file``.  A progress
    line is printed per classified file.

    Returns one dict per classified file with the keys
    ``file``, ``project_id``, ``domains`` and ``new_project_name``.
    Files whose content is empty or whitespace-only are skipped.
    """
    from app.agent_runner import _classify_file

    # id → display name, used only for the progress output
    name_by_project_id: dict[str, str] = {}
    for project in fixture.projects_list:
        name_by_project_id[project.get("id", "")] = project.get("name", "")

    # Every file reachable from the fixture directory
    paths = await _scan_fixture_files(mock, fixture.directory)
    print(f"\n  Scanning {len(paths)} files in {fixture.directory}\n")

    classified: list[dict[str, Any]] = []
    for position, path in enumerate(paths, 1):
        payload = await mock._handle(
            action="read_file_content",
            data={"path": path},
        )
        text: str = payload.get("content", "")
        if not text.strip():
            continue

        project_id, domains, new_name = await _classify_file(
            file_path=path,
            file_content=text,
            projects=fixture.projects_list,
            config_data_types=fixture.data_types,
            custom_system_prompt=fixture.custom_step1_prompt or None,
        )

        # rsplit leaves the path untouched when it contains no "/"
        basename = path.rsplit("/", 1)[-1]
        label = name_by_project_id.get(project_id, new_name or "?")
        print(f"  [{position}/{len(paths)}] {basename}  →  {project_id} ({label})  {domains}")

        record: dict[str, Any] = {
            "file": path,
            "project_id": project_id,
            "domains": domains,
            "new_project_name": new_name,
        }
        classified.append(record)

    return classified


async def _scan_fixture_files(mock: MockExecutor, directory: str) -> list[str]:
    """Return the sorted paths of all files below *directory*.

    Directories are listed through the mock executor's ``list_directory``
    action and descended into depth-first; entries that are neither
    ``file`` nor ``directory`` are ignored.
    """

    async def _collect(root: str) -> list[str]:
        listing = await mock._handle(action="list_directory", data={"path": root})
        found: list[str] = []
        for item in listing.get("entries", []):
            kind = item.get("type")
            if kind == "file":
                found.append(item["path"])
            elif kind == "directory":
                found.extend(await _collect(item["path"]))
        return found

    discovered = await _collect(directory)
    discovered.sort()
    return discovered


def _score_step1(
    fixture: EvalFixture,
    results: list[dict[str, Any]],
) -> tuple[float, float, float, str]:
    """Score step-1 results. Returns (precision, recall, f1, reasoning).

    Every entry in ``fixture.expected_classification`` is reported as:

    - ``OK``   — project id matches and (when the expectation lists domains)
      the domain sets are equal;
    - ``FAIL`` — a result exists for the file but project and/or domains differ;
    - ``MISS`` — no result exists for the file.

    Result files without an expectation are listed as informational
    (``·``) lines and do not affect the score.

    All three returned metrics carry the same value, ``matched / total``,
    where *total* is the number of expected classifications.  When the
    fixture has no expectations the result is
    ``(0.0, 0.0, 0.0, "No expected classifications")``.
    """
    if not fixture.expected_classification:
        return 0.0, 0.0, 0.0, "No expected classifications"

    # Build project name lookup
    proj_names: dict[str, str] = {
        p.get("id", ""): p.get("name", "") for p in fixture.projects_list
    }
    proj_names["new"] = "(new project)"

    def _proj_label(pid: str, new_name: str | None = None) -> str:
        name = proj_names.get(pid, "?")
        if pid == "new" and new_name:
            return f"new → \"{new_name}\""
        return f"{pid} ({name})" if name and name != "?" else pid

    def _short_file(path: str) -> str:
        """Use just the filename for cleaner display."""
        # rsplit returns the whole string when there is no "/" to split on
        return path.rsplit("/", 1)[-1]

    # Index results once instead of rescanning the list per expectation.
    # setdefault keeps the first result for a file if one appears twice.
    results_by_file: dict[str, dict[str, Any]] = {}
    for r in results:
        results_by_file.setdefault(r["file"], r)

    expected_files = {ec.file for ec in fixture.expected_classification}
    total = len(fixture.expected_classification)
    matched = 0

    scored_lines: list[str] = []
    info_lines: list[str] = []

    # Score expected files
    for ec in fixture.expected_classification:
        actual = results_by_file.get(ec.file)
        fname = _short_file(ec.file)
        if actual is None:
            scored_lines.append(f"  MISS  {fname}")
            scored_lines.append(f"          expected: {_proj_label(ec.project_id)}")
            continue

        # Show the proposed name for "new" projects, as the info section does
        actual_label = _proj_label(actual["project_id"], actual.get("new_project_name"))

        pid_ok = actual["project_id"] == ec.project_id
        # An expectation without domains only checks the project.  A missing
        # (None) domain list from the classifier counts as "no domains".
        domains_ok = (
            set(actual["domains"] or []) == set(ec.domains) if ec.domains else True
        )

        if pid_ok and domains_ok:
            matched += 1
            scored_lines.append(f"  OK    {fname}")
            scored_lines.append(f"          project: {actual_label}")
            scored_lines.append(f"          domains: {actual['domains']}")
        else:
            scored_lines.append(f"  FAIL  {fname}")
            if not pid_ok:
                scored_lines.append(
                    f"          project: {actual_label}  (expected: {_proj_label(ec.project_id)})"
                )
            else:
                scored_lines.append(f"          project: {actual_label}")
            if not domains_ok:
                scored_lines.append(f"          domains: {actual['domains']}  (expected: {ec.domains})")
            else:
                scored_lines.append(f"          domains: {actual['domains']}")

    # Show unscored files
    for r in results:
        if r["file"] not in expected_files:
            fname = _short_file(r["file"])
            proj = _proj_label(r["project_id"], r.get("new_project_name"))
            info_lines.append(f"  ·     {fname}")
            info_lines.append(f"          project: {proj}  |  domains: {r['domains']}")

    # One expectation per file, so all three metrics are matched / total
    precision = matched / total if total > 0 else 0.0
    recall = precision
    f1 = precision

    parts: list[str] = []
    if scored_lines:
        parts.append(f"Scored ({matched}/{total}):")
        parts.extend(scored_lines)
    if info_lines:
        # Each unscored file contributes two lines
        parts.append(f"\nOther files ({len(info_lines) // 2}):")
        parts.extend(info_lines)

    return precision, recall, f1, "\n".join(parts)


# ── Step 2 runner ─────────────────────────────────────────────────────────


async def _run_step2(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
) -> None:
    """Run the step-2 processing prompt over the files in the fixture directory.

    ``_PROCESSING_SYSTEM_PROMPT`` is compiled once with the fixture's
    variables, then the tool-calling loop is run for every non-empty file
    listed directly in ``fixture.directory`` (optionally restricted to
    ``fixture.file_extensions``).  Nothing is returned; the loop's effects
    are observed through the mock executor.
    """
    from app.agent_runner import (
        _MAX_PROCESSING_STEPS,
        _PROCESSING_SYSTEM_PROMPT,
        _build_processing_tools,
        _run_agent_with_tools,
    )
    from app import tracing

    # Fixture-supplied values substituted into the processing prompt
    prompt_variables = {
        "existing_context": fixture.existing_context,
        "project_context": fixture.project_context,
        "data_types": ", ".join(fixture.data_types),
        "custom_prompt_section": fixture.custom_prompt_section,
    }
    compiled_prompt = tracing.compile_prompt(
        "batch_processing",
        fallback=_PROCESSING_SYSTEM_PROMPT,
        variables=prompt_variables,
    )

    processing_tools = _build_processing_tools(fixture.data_types)

    # Only the directory's direct entries are considered
    listing = await mock._handle(
        action="list_directory",
        data={"path": fixture.directory},
    )
    for item in listing.get("entries", []):
        if item.get("type") != "file":
            continue

        # Optional extension filter; a name without "." has extension ""
        if fixture.file_extensions:
            _, dot, suffix = item["name"].rpartition(".")
            extension = suffix if dot else ""
            if extension not in fixture.file_extensions:
                continue

        payload = await mock._handle(
            action="read_file_content",
            data={"path": item["path"]},
        )
        text: str = payload.get("content", "")
        if not text.strip():
            continue

        request = (
            f"Process this file and extract relevant information.\n\n"
            f"File: {item['path']}\n\nContent:\n{text}"
        )
        await _run_agent_with_tools(
            system_prompt=compiled_prompt,
            user_message=request,
            tools=processing_tools,
            max_steps=_MAX_PROCESSING_STEPS,
        )


# ── Full runner ───────────────────────────────────────────────────────────


async def _run_full(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
    user_id: str,
) -> None:
    """Drive the whole two-step pipeline through ``run_local_agent``.

    An ``agent_trigger`` payload is assembled from the fixture and passed to
    ``run_local_agent`` for *user_id* while the mock executor is patched in.
    """
    from app.agent_runner import run_local_agent

    target_dir = fixture.directory
    payload: dict[str, Any] = {
        "type": "agent_trigger",
        "directory": target_dir,
        "directory_paths": [target_dir],
        "data_types": fixture.data_types,
        "file_extensions": fixture.file_extensions,
        "prompt_template": fixture.custom_prompt_section,
        "device_id": "eval-harness",
    }
    payload["run_context"] = {
        "agent_id": f"eval-{fixture.name}",
        "run_id": None,
    }

    with mock.patch():
        await run_local_agent(user_id, payload)


# ── Scoring helpers ───────────────────────────────────────────────────────


def _score_mutations(
    fixture: EvalFixture,
    mock: MockExecutor,
) -> tuple[list[FieldScore], float, float, float, int, int]:
    """Score mutations against expected records.

    Expected records are grouped by table and compared, table by table, with
    the records the mock reports as created or updated.  Tables are visited
    in a stable order — tables from ``fixture.expected`` first (fixture
    order), then tables that only appear in ``mock.mutations`` — so the
    returned ``field_scores`` list is ordered identically on every run.

    Returns (field_scores, precision, recall, f1, extra, missing), where
    precision/recall/f1 come from ``compute_precision_recall`` over the
    summed expected, actual and matched record counts, and extra/missing
    are the per-table counts from ``score_field_match`` summed up.
    """
    all_field_scores: list[FieldScore] = []
    total_expected = 0
    total_actual = 0
    total_matched = 0
    total_extra = 0
    total_missing = 0

    expected_by_table: dict[str, list[dict]] = {}
    for rec in fixture.expected:
        expected_by_table.setdefault(rec.table, []).append(rec.fields)

    # dict.fromkeys de-duplicates while keeping first-seen order.  Iterating a
    # set here would reorder tables between processes (string hashing is
    # randomised), making eval reports differ from run to run.
    tables = list(
        dict.fromkeys([*expected_by_table, *(m.table for m in mock.mutations)])
    )
    for table in tables:
        expected_records = expected_by_table.get(table, [])
        actual_records = mock.created_records(table) + mock.updated_records(table)

        field_scores, extra, missing = score_field_match(expected_records, actual_records, table)
        all_field_scores.extend(field_scores)

        # A field score with a best_match counts as one matched record
        matched = sum(1 for s in field_scores if s.best_match is not None)
        total_expected += len(expected_records)
        total_actual += len(actual_records)
        total_matched += matched
        total_extra += extra
        total_missing += missing

    precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
    return all_field_scores, precision, recall, f1, total_extra, total_missing


# ── Main entry point ──────────────────────────────────────────────────────


async def run_single_eval(
    fixture: EvalFixture,
    model: str,
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> EvalScores:
    """Execute one eval run for a fixture + model.  Mode is read from the fixture.

    The pipeline runs against a fresh ``MockExecutor`` seeded with a deep
    copy of ``fixture.seed_records``.  ``settings.LLM_MODEL`` is replaced by
    *model* for the duration of the run and restored afterwards.

    A pipeline failure is logged with its traceback and does not abort the
    eval: scoring proceeds on whatever results and mutations were captured
    before the error.  Failures while posting the extra classification
    scores to Langfuse are logged and otherwise ignored.

    Args:
        fixture: The fixture to evaluate; ``fixture.mode`` selects
            ``step1``, ``step2`` or ``full``.
        model: Model name written to ``settings.LLM_MODEL`` for this run.
        use_llm_judge: When true (and the fixture has expected records),
            ``llm_judge_score`` is called for step2/full runs.
        judge_model: Model name passed to ``llm_judge_score``.

    Returns:
        The ``EvalScores`` for this fixture/model pair.
    """
    from shared.config import settings
    from shared.ws_context import set_current_user, clear_current_user

    # Deep copy so the run cannot mutate the fixture's own seed data
    seed = copy.deepcopy(fixture.seed_records)
    mock = MockExecutor(
        fixture_dir=fixture.fixture_path.parent,
        seed_records=seed,
    )

    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    eval_user_id = str(uuid.uuid4())

    logger.info(
        "eval: starting %s | mode=%s | model=%s",
        fixture.name, fixture.mode, model,
    )
    # Monotonic clock: the elapsed time is unaffected by wall-clock changes
    start_time = time.monotonic()

    step1_results: list[dict[str, Any]] = []
    step1_reasoning = ""

    try:
        set_current_user(eval_user_id)

        if fixture.mode == "step1":
            with mock.patch():
                step1_results = await _run_step1(fixture, model, mock)

        elif fixture.mode == "step2":
            with mock.patch():
                await _run_step2(fixture, model, mock)

        elif fixture.mode == "full":
            with mock.patch():
                # Step 1 — classification (independent from run_local_agent)
                if fixture.expected_classification:
                    step1_results = await _run_step1(fixture, model, mock)

            # Step 2 — full pipeline (run_local_agent handles both steps);
            # _run_full opens its own mock.patch() context.
            await _run_full(fixture, model, mock, eval_user_id)

        else:
            # Previously an unrecognised mode ran nothing without any hint
            logger.warning(
                "eval: unknown mode %r for %s — no pipeline was run",
                fixture.mode, fixture.name,
            )

    except Exception as exc:
        # Best effort: keep going so partial results are still scored, but
        # record the traceback so the failure can be diagnosed.
        logger.exception("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
    finally:
        settings.LLM_MODEL = original_model
        clear_current_user()

    elapsed = time.monotonic() - start_time
    logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))

    # ── Score ─────────────────────────────────────────────────────

    # (precision, recall, f1) of step-1 classification in full mode; computed
    # once here and reused when reporting to Langfuse.
    classification_scores: tuple[float, float, float] | None = None

    if fixture.mode == "step1":
        s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
        scores = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant=fixture.mode,
            precision=s1_precision,
            recall=s1_recall,
            f1=s1_f1,
            llm_judge_reasoning=step1_reasoning,
        )
    else:
        # step2 or full — score mutations
        field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
        scores = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant=fixture.mode,
            field_scores=field_scores,
            precision=precision,
            recall=recall,
            f1=f1,
            extra_records=extra,
            missing_records=missing,
        )

        # Add step1 classification scores for full mode
        if fixture.mode == "full" and fixture.expected_classification:
            s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
            classification_scores = (s1_p, s1_r, s1_f1)
            scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"

        # Optional LLM judge for extraction quality
        if use_llm_judge and fixture.expected:
            all_expected = [r.fields for r in fixture.expected]
            all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
            judge_score, reasoning = await llm_judge_score(
                all_expected, all_actual, judge_model=judge_model,
            )
            scores.llm_judge_score = judge_score
            if step1_reasoning:
                # Keep the classification report and append the judge's verdict
                scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
            else:
                scores.llm_judge_reasoning = reasoning

    # ── Report to Langfuse ────────────────────────────────────────
    prompt_names = {
        "step1": ["batch_file_classifier"],
        "step2": ["batch_processing"],
        "full": ["batch_file_classifier", "batch_processing"],
    }.get(fixture.mode, ["batch_processing"])

    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant=fixture.mode,
        prompt_template=fixture.custom_prompt_section or "(default)",
        actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
        scores_summary=scores.summary(),
        step1_results=step1_results or None,
        langfuse_prompt_names=prompt_names,
    )

    if trace_id:
        langfuse_eval.post_eval_scores(scores, trace_id=trace_id)

        # For full mode, post classification scores separately
        if classification_scores is not None:
            try:
                from langfuse import get_client
                lf = get_client()
            except Exception:
                logger.warning(
                    "eval: Langfuse client unavailable; classification scores not posted",
                    exc_info=True,
                )
                lf = None

            if lf:
                score_names = (
                    "classification_precision",
                    "classification_recall",
                    "classification_f1",
                )
                for name, value in zip(score_names, classification_scores):
                    # Per-score guard so one failed post does not drop the rest
                    try:
                        lf.create_score(
                            name=name,
                            value=value,
                            trace_id=trace_id,
                            data_type="NUMERIC",
                            comment=f"{fixture.name} | {model} | full",
                        )
                    except Exception:
                        logger.warning(
                            "eval: failed to post %s to Langfuse", name, exc_info=True,
                        )

    return scores


async def run_fixture_eval(
    fixture: EvalFixture,
    models: list[str],
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> list[EvalScores]:
    """Evaluate *fixture* against each model in turn.

    The fixture is synced to the Langfuse dataset first; the models are then
    run one after another and their scores returned in the same order.
    """
    langfuse_eval.sync_fixture_to_dataset(fixture)

    # Awaiting inside the comprehension keeps the runs strictly sequential
    return [
        await run_single_eval(
            fixture,
            candidate,
            use_llm_judge=use_llm_judge,
            judge_model=judge_model,
        )
        for candidate in models
    ]


def print_results(results: list[EvalScores]) -> None:
    """Write a summary table of eval results to stdout.

    One row is printed per result (precision, recall, F1, field accuracy and
    LLM-judge score; ``--`` where a value is not available), followed by a
    section with the reasoning text of every result that has one.
    """
    if not results:
        print("\nNo eval results.")
        return

    width = 90
    heavy_rule = "=" * width
    light_rule = "-" * width
    section_rule = "─" * width
    not_available = "  --"

    print("\n" + heavy_rule)
    print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
    print(light_rule)

    for score in results:
        if score.llm_judge_score is not None:
            judge_cell = f"{score.llm_judge_score:.2f}"
        else:
            judge_cell = not_available

        # Field accuracy is only meaningful when field scores were collected
        if score.field_scores:
            accuracy_cell = f"{score.field_accuracy:.2f}"
        else:
            accuracy_cell = not_available

        identity = f"{score.fixture_name:<25} {score.prompt_variant:<6} {score.model:<25} "
        metrics = f"{score.precision:>6.2f} {score.recall:>6.2f} {score.f1:>6.2f} "
        trailer = f"{accuracy_cell:>6} {judge_cell:>6}"
        print(identity + metrics + trailer)

    print(heavy_rule)

    for score in results:
        if not score.llm_judge_reasoning:
            continue
        print(f"\n{section_rule}")
        print(f"  {score.fixture_name}  |  {score.model}  |  {score.prompt_variant}")
        print(f"{section_rule}")
        print(score.llm_judge_reasoning)

    print()