"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.

Supports three eval modes:

- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
  Calls the LLM with fixture-provided ``domain_definitions`` and
  ``projects_list`` and compares output against ``expected_classification``.

- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
  Compiles the prompt with fixture-provided ``existing_context``,
  ``project_context``, ``data_types``, and ``custom_prompt_section``,
  then runs the tool-calling loop.  Mutations are scored against
  ``expected`` records.

- **full**: Run ``run_local_agent()`` end-to-end (both steps).
  Scored on both classification and extraction.
"""

from __future__ import annotations

import copy
import json
import logging
import time
import uuid
from typing import Any

from eval.config import EvalFixture, ExpectedClassification
from eval.mock_executor import MockExecutor
from eval.scorer import (
    EvalScores,
    FieldScore,
    compute_precision_recall,
    llm_judge_score,
    score_field_match,
)
from eval import langfuse_eval

# Module-level logger; messages emitted from this file are prefixed with "eval:".
logger = logging.getLogger(__name__)


# ── Step 1 runner ─────────────────────────────────────────────────────────


async def _run_step1(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
) -> list[dict[str, Any]]:
    """Classify every file named in ``fixture.expected_classification``.

    Each file is read through ``mock`` (``read_file_content``) and handed to
    ``_classify_file`` together with the fixture's ``projects_list`` and
    ``data_types``.  Files are handled one after another, in fixture order.

    ``model`` is part of the runner signature but is not read here.

    Returns one dict per expected file with the keys ``file``,
    ``project_id``, ``domains`` and ``new_project_name``.
    """
    from app.agent_runner import _classify_file

    async def _classify_one(path: str) -> dict[str, Any]:
        # A missing "content" key is treated as an empty file.
        read_reply = await mock._handle(
            action="read_file_content",
            data={"path": path},
        )
        text: str = read_reply.get("content", "")

        classification = await _classify_file(
            file_path=path,
            file_content=text,
            projects=fixture.projects_list,
            config_data_types=fixture.data_types,
        )
        project_id, domains, new_name = classification
        return {
            "file": path,
            "project_id": project_id,
            "domains": domains,
            "new_project_name": new_name,
        }

    # Awaiting inside the comprehension keeps the calls strictly sequential.
    return [await _classify_one(ec.file) for ec in fixture.expected_classification]


def _score_step1(
    fixture: EvalFixture,
    results: list[dict[str, Any]],
) -> tuple[float, float, float, str]:
    """Score step-1 results. Returns (precision, recall, f1, reasoning)."""
    if not fixture.expected_classification:
        return 0.0, 0.0, 0.0, "No expected classifications"

    total = len(fixture.expected_classification)
    matched = 0
    details: list[str] = []

    for ec in fixture.expected_classification:
        actual = next((r for r in results if r["file"] == ec.file), None)
        if actual is None:
            details.append(f"  MISS {ec.file}: not processed")
            continue

        pid_ok = actual["project_id"] == ec.project_id
        domains_ok = set(actual["domains"]) == set(ec.domains) if ec.domains else True

        if pid_ok and domains_ok:
            matched += 1
            details.append(f"  OK   {ec.file}: project={actual['project_id']}, domains={actual['domains']}")
        else:
            parts: list[str] = []
            if not pid_ok:
                parts.append(f"project expected={ec.project_id} got={actual['project_id']}")
            if not domains_ok:
                parts.append(f"domains expected={ec.domains} got={actual['domains']}")
            details.append(f"  FAIL {ec.file}: {'; '.join(parts)}")

    precision = matched / total if total > 0 else 0.0
    recall = precision  # in step1, precision == recall (same denominator)
    f1 = precision  # same
    reasoning = "\n".join(details)
    return precision, recall, f1, reasoning


# ── Step 2 runner ─────────────────────────────────────────────────────────


async def _run_step2(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
) -> None:
    """Run the step-2 processing prompt over the files in ``fixture.directory``.

    The ``batch_processing`` prompt is compiled once (falling back to
    ``_PROCESSING_SYSTEM_PROMPT``) from the fixture's ``existing_context``,
    ``project_context``, ``data_types`` and ``custom_prompt_section``.  The
    directory is then listed through ``mock`` and the tool-calling loop is run
    for each entry that is a file, passes the optional extension filter, and
    has non-blank content.

    ``model`` is part of the runner signature but is not read here.  Nothing
    is returned.
    """
    from app.agent_runner import (
        _PROCESSING_SYSTEM_PROMPT,
        _build_processing_tools,
        _run_agent_with_tools,
        _MAX_PROCESSING_STEPS,
    )
    from app import tracing

    prompt_variables = {
        "existing_context": fixture.existing_context,
        "project_context": fixture.project_context,
        "data_types": ", ".join(fixture.data_types),
        "custom_prompt_section": fixture.custom_prompt_section,
    }
    system_prompt = tracing.compile_prompt(
        "batch_processing",
        fallback=_PROCESSING_SYSTEM_PROMPT,
        variables=prompt_variables,
    )

    tools = _build_processing_tools(fixture.data_types)

    listing = await mock._handle(
        action="list_directory",
        data={"path": fixture.directory},
    )
    for item in listing.get("entries", []):
        if item.get("type") != "file":
            continue

        if fixture.file_extensions:
            # Text after the last dot; empty when the name has no dot at all.
            _, dot, tail = item["name"].rpartition(".")
            suffix = tail if dot else ""
            if suffix not in fixture.file_extensions:
                continue

        read_reply = await mock._handle(
            action="read_file_content",
            data={"path": item["path"]},
        )
        body: str = read_reply.get("content", "")
        if not body.strip():
            # Blank or whitespace-only files are skipped.
            continue

        user_message = (
            f"Process this file and extract relevant information.\n\n"
            f"File: {item['path']}\n\nContent:\n{body}"
        )
        await _run_agent_with_tools(
            system_prompt=system_prompt,
            user_message=user_message,
            tools=tools,
            max_steps=_MAX_PROCESSING_STEPS,
        )


# ── Full runner ───────────────────────────────────────────────────────────


async def _run_full(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
    user_id: str,
) -> None:
    """Invoke ``run_local_agent`` for the fixture with the mock patched in.

    Builds an ``agent_trigger`` payload from the fixture (directory, data
    types, file extensions, custom prompt section) and awaits
    ``run_local_agent(user_id, payload)`` inside ``mock.patch()``.

    ``model`` is part of the runner signature but is not read here.
    """
    from app.agent_runner import run_local_agent

    target_dir = fixture.directory
    run_context = {
        "agent_id": f"eval-{fixture.name}",
        "run_id": None,
    }
    trigger_data: dict[str, Any] = dict(
        type="agent_trigger",
        directory=target_dir,
        directory_paths=[target_dir],
        data_types=fixture.data_types,
        file_extensions=fixture.file_extensions,
        prompt_template=fixture.custom_prompt_section,
        device_id="eval-harness",
        run_context=run_context,
    )

    with mock.patch():
        await run_local_agent(user_id, trigger_data)


# ── Scoring helpers ───────────────────────────────────────────────────────


def _score_mutations(
    fixture: EvalFixture,
    mock: MockExecutor,
) -> tuple[list[FieldScore], float, float, float, int, int]:
    """Compare the records captured by ``mock`` with ``fixture.expected``.

    Expected records are grouped by table.  For every table that appears in
    the expectations or in ``mock.mutations``, the created plus updated
    records are scored with ``score_field_match``.  Counts are summed across
    tables before ``compute_precision_recall`` is applied once.

    Returns ``(field_scores, precision, recall, f1, extra, missing)``.
    """
    # Group expected field dicts by their table name.
    wanted: dict[str, list[dict]] = {}
    for record in fixture.expected:
        bucket = wanted.get(record.table)
        if bucket is None:
            bucket = wanted[record.table] = []
        bucket.append(record.fields)

    collected: list[FieldScore] = []
    n_expected = n_actual = n_matched = n_extra = n_missing = 0

    table_names = set(wanted) | {m.table for m in mock.mutations}
    for name in table_names:
        want = wanted.get(name, [])
        got = mock.created_records(name) + mock.updated_records(name)

        per_field, surplus, absent = score_field_match(want, got, name)
        collected.extend(per_field)

        # A field score with a best_match counts as one matched record.
        n_matched += len([fs for fs in per_field if fs.best_match is not None])
        n_expected += len(want)
        n_actual += len(got)
        n_extra += surplus
        n_missing += absent

    precision, recall, f1 = compute_precision_recall(n_expected, n_actual, n_matched)
    return collected, precision, recall, f1, n_extra, n_missing


# ── Main entry point ──────────────────────────────────────────────────────


async def run_single_eval(
    fixture: EvalFixture,
    model: str,
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> EvalScores:
    """Execute one eval run for a fixture + model.  Mode is read from the fixture.

    The run has three phases:

    1. **Pipeline** — ``settings.LLM_MODEL`` is temporarily set to ``model``
       and the runner for ``fixture.mode`` (``step1``, ``step2`` or ``full``)
       is executed against a fresh ``MockExecutor`` seeded with a deep copy of
       ``fixture.seed_records``.  Any exception is logged with its traceback
       and swallowed so that scoring still runs on whatever was captured.
       The original model and the current-user context are always restored.
    2. **Scoring** — ``step1`` is scored on classification only.  Other modes
       are scored on captured mutations; ``full`` additionally records the
       classification reasoning, and the optional LLM judge is applied when
       ``use_llm_judge`` is set and the fixture has expected records.
    3. **Reporting** — the run is logged to Langfuse.  When a trace id is
       returned, the scores are posted; for ``full`` mode with expected
       classifications the classification metrics are posted as separate
       scores.  Failures while posting those extra scores are logged, not
       raised.

    Args:
        fixture: The fixture to evaluate.
        model: Model name assigned to ``settings.LLM_MODEL`` for the run.
        use_llm_judge: Whether to call ``llm_judge_score`` on the mutations.
        judge_model: Model name passed to ``llm_judge_score``.

    Returns:
        The ``EvalScores`` built for this fixture/model pair.
    """
    from shared.config import settings
    from shared.ws_context import set_current_user, clear_current_user

    seed = copy.deepcopy(fixture.seed_records)
    mock = MockExecutor(
        fixture_dir=fixture.fixture_path.parent,
        seed_records=seed,
    )

    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    eval_user_id = str(uuid.uuid4())

    logger.info(
        "eval: starting %s | mode=%s | model=%s",
        fixture.name, fixture.mode, model,
    )
    # Monotonic clock: the elapsed time must not be affected by wall-clock jumps.
    start_time = time.monotonic()

    step1_results: list[dict[str, Any]] = []
    step1_reasoning = ""

    try:
        set_current_user(eval_user_id)

        if fixture.mode == "step1":
            with mock.patch():
                step1_results = await _run_step1(fixture, model, mock)

        elif fixture.mode == "step2":
            with mock.patch():
                await _run_step2(fixture, model, mock)

        elif fixture.mode == "full":
            with mock.patch():
                # Step 1 — classification (independent from run_local_agent)
                if fixture.expected_classification:
                    step1_results = await _run_step1(fixture, model, mock)

            # Step 2 — full pipeline (run_local_agent handles both steps);
            # _run_full applies mock.patch() itself.
            await _run_full(fixture, model, mock, eval_user_id)

    except Exception as exc:
        # Best-effort: keep going so partial results are still scored, but
        # record the traceback so the failure can be diagnosed.
        logger.exception("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
    finally:
        settings.LLM_MODEL = original_model
        clear_current_user()

    elapsed = time.monotonic() - start_time
    logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))

    # ── Score ─────────────────────────────────────────────────────

    # (precision, recall, f1) of the classification step in full mode;
    # computed once here and reused when reporting to Langfuse.
    classification_scores: tuple[float, float, float] | None = None

    if fixture.mode == "step1":
        s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
        scores = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant=fixture.mode,
            precision=s1_precision,
            recall=s1_recall,
            f1=s1_f1,
            llm_judge_reasoning=step1_reasoning,
        )
    else:
        # step2 or full — score mutations
        field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
        scores = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant=fixture.mode,
            field_scores=field_scores,
            precision=precision,
            recall=recall,
            f1=f1,
            extra_records=extra,
            missing_records=missing,
        )

        # Add step1 classification scores for full mode
        if fixture.mode == "full" and fixture.expected_classification:
            s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
            classification_scores = (s1_p, s1_r, s1_f1)
            scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"

        # Optional LLM judge for extraction quality
        if use_llm_judge and fixture.expected:
            all_expected = [r.fields for r in fixture.expected]
            all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
            judge_score, reasoning = await llm_judge_score(
                all_expected, all_actual, judge_model=judge_model,
            )
            scores.llm_judge_score = judge_score
            if step1_reasoning:
                # Keep the classification details and append the judge's text.
                scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
            else:
                scores.llm_judge_reasoning = reasoning

    # ── Report to Langfuse ────────────────────────────────────────
    prompt_names = {
        "step1": ["batch_file_classifier"],
        "step2": ["batch_processing"],
        "full": ["batch_file_classifier", "batch_processing"],
    }.get(fixture.mode, ["batch_processing"])

    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant=fixture.mode,
        prompt_template=fixture.custom_prompt_section or "(default)",
        actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
        scores_summary=scores.summary(),
        step1_results=step1_results or None,
        langfuse_prompt_names=prompt_names,
    )

    if trace_id:
        langfuse_eval.post_eval_scores(scores, trace_id=trace_id)

        # For full mode, post classification scores separately
        if classification_scores is not None:
            try:
                from langfuse import get_client
                lf = get_client()
            except Exception:
                logger.warning(
                    "eval: Langfuse client unavailable; classification scores not posted for %s/%s",
                    fixture.name, model, exc_info=True,
                )
                lf = None

            if lf:
                s1_p, s1_r, s1_f1 = classification_scores
                for name, value in (
                    ("classification_precision", s1_p),
                    ("classification_recall", s1_r),
                    ("classification_f1", s1_f1),
                ):
                    # Per-score try: one failed post must not block the others.
                    try:
                        lf.create_score(
                            name=name,
                            value=value,
                            trace_id=trace_id,
                            data_type="NUMERIC",
                            comment=f"{fixture.name} | {model} | full",
                        )
                    except Exception:
                        logger.warning(
                            "eval: failed to post %s for %s/%s",
                            name, fixture.name, model, exc_info=True,
                        )

    return scores


async def run_fixture_eval(
    fixture: EvalFixture,
    models: list[str],
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> list[EvalScores]:
    """Evaluate ``fixture`` once per entry in ``models``.

    The fixture is first synced to the Langfuse dataset, then
    ``run_single_eval`` is awaited for each model in turn.  The returned list
    has one ``EvalScores`` per model, in the order given.
    """
    langfuse_eval.sync_fixture_to_dataset(fixture)

    judge_options = {"use_llm_judge": use_llm_judge, "judge_model": judge_model}
    # Awaiting inside the comprehension runs the models one after another.
    return [
        await run_single_eval(fixture, candidate, **judge_options)
        for candidate in models
    ]


def print_results(results: list[EvalScores]) -> None:
    """Write a fixed-width summary table of ``results`` to stdout.

    With no results, a single "No eval results." notice is printed.
    Otherwise one row per result shows fixture, mode, model, precision,
    recall, F1, field accuracy and the LLM judge score (``--`` when the score
    is ``None``).  The table is followed by the reasoning text of every
    result that has one.
    """
    if not results:
        print("\nNo eval results.")
        return

    table_rule = "=" * 95
    header = (
        f"{'Fixture':<25} {'Mode':<6} {'Model':<25} "
        f"{'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}"
    )

    print("\n" + table_rule)
    print(header)
    print("-" * 95)

    for row in results:
        if row.llm_judge_score is None:
            judge_cell = "  --"
        else:
            judge_cell = f"{row.llm_judge_score:.2f}"
        identity = f"{row.fixture_name:<25} {row.prompt_variant:<6} {row.model:<25}"
        metrics = (
            f"{row.precision:>6.2f} {row.recall:>6.2f} {row.f1:>6.2f} "
            f"{row.field_accuracy:>6.2f}"
        )
        print(f"{identity} {metrics} {judge_cell:>6}")

    print(table_rule)
    print()

    print("=" * 90)

    # Reasoning section: only results with non-empty reasoning are listed.
    with_reasoning = [row for row in results if row.llm_judge_reasoning]
    for row in with_reasoning:
        print(f"\n[{row.model} / {row.prompt_variant}] LLM Judge: {row.llm_judge_reasoning}")

    print()